mirror of
https://github.com/roytam1/UXP.git
synced 2026-05-26 14:54:25 +00:00
Merge remote-tracking branch 'origin/custom' into custom-platform
This commit is contained in:
@@ -1 +1 @@
|
||||
28.10.6a1
|
||||
28.10.7a1
|
||||
@@ -157,6 +157,7 @@ def old_configure_options(*options):
|
||||
'--enable-address-sanitizer',
|
||||
'--enable-alsa',
|
||||
'--enable-av1',
|
||||
'--enable-jxl',
|
||||
'--enable-b2g-bt',
|
||||
'--enable-b2g-camera',
|
||||
'--enable-b2g-ril',
|
||||
|
||||
@@ -334,8 +334,10 @@ def lib_path(target, vc_path, windows_sdk_dir, ucrt_sdk_dir, dia_sdk_dir):
|
||||
|
||||
atlmfc_dir = os.path.join(vc_path, 'atlmfc', 'lib', *vc_target)
|
||||
if not os.path.isdir(atlmfc_dir):
|
||||
die('Cannot find the ATL/MFC libraries in the Visual C++ directory (%s). '
|
||||
'Please install them.' % vc_path)
|
||||
atlmfc_dir = atlmfc_dir.replace('amd64','x64')
|
||||
if not os.path.isdir(atlmfc_dir):
|
||||
die('Cannot find the ATL/MFC libraries in the Visual C++ directory (%s). '
|
||||
'Please install them.' % vc_path)
|
||||
|
||||
|
||||
libs = []
|
||||
|
||||
Vendored
+3
@@ -42,6 +42,9 @@ if CONFIG['CPU_ARCH'] == 'arm':
|
||||
if CONFIG['MOZ_FFVPX']:
|
||||
external_dirs += ['media/ffvpx']
|
||||
|
||||
if CONFIG["MOZ_JXL"]:
|
||||
external_dirs += ["media/libjxl", "media/highway"]
|
||||
|
||||
external_dirs += [
|
||||
'media/kiss_fft',
|
||||
'media/libcubeb',
|
||||
|
||||
@@ -10,4 +10,4 @@
|
||||
# hardcoded milestones in the tree from these two files.
|
||||
#--------------------------------------------------------
|
||||
|
||||
4.8.6
|
||||
4.8.7
|
||||
@@ -17,12 +17,6 @@
|
||||
# define MOZ_INCLUDE_MOZALLOC_H_FROM_${HEADER}
|
||||
#endif
|
||||
|
||||
// Code built with !_HAS_EXCEPTIONS calls std::_Throw(), but the win2k
|
||||
// CRT doesn't export std::_Throw(). So we define it.
|
||||
#ifndef mozilla_Throw_h
|
||||
# include "mozilla/throw_msvc.h"
|
||||
#endif
|
||||
|
||||
#ifdef _DEBUG
|
||||
// From
|
||||
// http://msdn.microsoft.com/en-us/library/aa985982%28VS.80%29.aspx
|
||||
|
||||
+24801
-12689
File diff suppressed because it is too large
Load Diff
+896
-181
File diff suppressed because it is too large
Load Diff
@@ -448,6 +448,9 @@ private:
|
||||
DECL_GFX_PREF(Live, "image.mozsamplesize.enabled", ImageMozSampleSizeEnabled, bool, false);
|
||||
DECL_GFX_PREF(Once, "image.multithreaded_decoding.limit", ImageMTDecodingLimit, int32_t, -1);
|
||||
DECL_GFX_PREF(Live, "image.webp.enabled", ImageWebPEnabled, bool, true);
|
||||
#ifdef MOZ_JXL
|
||||
DECL_GFX_PREF(Live, "image.jxl.enabled", ImageJXLEnabled, bool, true);
|
||||
#endif
|
||||
|
||||
DECL_GFX_PREF(Once, "layers.acceleration.enabled", LayersAccelerationEnabledDoNotUseDirectly, bool, true);
|
||||
DECL_GFX_PREF(Live, "layers.acceleration.draw-fps", LayersDrawFPS, bool, false);
|
||||
|
||||
@@ -19,6 +19,9 @@
|
||||
#include "nsICODecoder.h"
|
||||
#include "nsIconDecoder.h"
|
||||
#include "nsWebPDecoder.h"
|
||||
#ifdef MOZ_JXL
|
||||
# include "nsJXLDecoder.h"
|
||||
#endif
|
||||
|
||||
namespace mozilla {
|
||||
|
||||
@@ -77,6 +80,12 @@ DecoderFactory::GetDecoderType(const char* aMimeType)
|
||||
gfxPrefs::ImageWebPEnabled()) {
|
||||
type = DecoderType::WEBP;
|
||||
}
|
||||
#ifdef MOZ_JXL
|
||||
else if (!strcmp(aMimeType, IMAGE_JXL) &&
|
||||
gfxPrefs::ImageJXLEnabled()) {
|
||||
type = DecoderType::JXL;
|
||||
}
|
||||
#endif
|
||||
return type;
|
||||
}
|
||||
|
||||
@@ -116,6 +125,11 @@ DecoderFactory::GetDecoder(DecoderType aType,
|
||||
case DecoderType::WEBP:
|
||||
decoder = new nsWebPDecoder(aImage);
|
||||
break;
|
||||
#ifdef MOZ_JXL
|
||||
case DecoderType::JXL:
|
||||
decoder = new nsJXLDecoder(aImage);
|
||||
break;
|
||||
#endif
|
||||
default:
|
||||
MOZ_ASSERT_UNREACHABLE("Unknown decoder type");
|
||||
}
|
||||
@@ -188,8 +202,15 @@ DecoderFactory::CreateAnimationDecoder(DecoderType aType,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
MOZ_ASSERT(aType == DecoderType::GIF || aType == DecoderType::PNG ||
|
||||
aType == DecoderType::WEBP,
|
||||
bool validDecoderType = (
|
||||
aType == DecoderType::GIF ||
|
||||
aType == DecoderType::PNG ||
|
||||
aType == DecoderType::WEBP);
|
||||
#ifdef MOZ_JXL
|
||||
validDecoderType = validDecoderType || aType == DecoderType::JXL;
|
||||
#endif
|
||||
|
||||
MOZ_ASSERT(validDecoderType,
|
||||
"Calling CreateAnimationDecoder for non-animating DecoderType");
|
||||
|
||||
// Create an anonymous decoder. Interaction with the SurfaceCache and the
|
||||
|
||||
@@ -38,6 +38,7 @@ enum class DecoderType
|
||||
ICO,
|
||||
ICON,
|
||||
WEBP,
|
||||
JXL,
|
||||
UNKNOWN
|
||||
};
|
||||
|
||||
|
||||
@@ -83,6 +83,7 @@ static const mozilla::Module::CategoryEntry kImageCategories[] = {
|
||||
{ "Gecko-Content-Viewers", IMAGE_APNG, "@mozilla.org/content/document-loader-factory;1" },
|
||||
{ "Gecko-Content-Viewers", IMAGE_X_PNG, "@mozilla.org/content/document-loader-factory;1" },
|
||||
{ "Gecko-Content-Viewers", IMAGE_WEBP, "@mozilla.org/content/document-loader-factory;1" },
|
||||
{ "Gecko-Content-Viewers", IMAGE_JXL, "@mozilla.org/content/document-loader-factory;1" },
|
||||
{ "content-sniffing-services", "@mozilla.org/image/loader;1", "@mozilla.org/image/loader;1" },
|
||||
{ nullptr }
|
||||
};
|
||||
|
||||
@@ -28,6 +28,11 @@ UNIFIED_SOURCES += [
|
||||
'nsWebPDecoder.cpp',
|
||||
]
|
||||
|
||||
if CONFIG["MOZ_JXL"]:
|
||||
UNIFIED_SOURCES += [
|
||||
"nsJXLDecoder.cpp",
|
||||
]
|
||||
|
||||
include('/ipc/chromium/chromium-config.mozbuild')
|
||||
|
||||
LOCAL_INCLUDES += [
|
||||
|
||||
@@ -0,0 +1,278 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "ImageLogging.h" // Must appear first
|
||||
#include "gfxPlatform.h"
|
||||
#include "jxl/codestream_header.h"
|
||||
#include "jxl/decode_cxx.h"
|
||||
#include "jxl/types.h"
|
||||
#include "mozilla/gfx/Point.h"
|
||||
#include "nsJXLDecoder.h"
|
||||
|
||||
#include "RasterImage.h"
|
||||
#include "SurfacePipeFactory.h"
|
||||
|
||||
using namespace mozilla::gfx;
|
||||
|
||||
namespace mozilla {
|
||||
namespace image {
|
||||
|
||||
// Error-propagation helpers for functions that return a lexer Transition.
//
// Both macros evaluate |expr| exactly once and make the *enclosing function*
// return Transition::TerminateFailure() when the expression reports failure.
// They deliberately do not end in a semicolon: the caller supplies it, so an
// invocation behaves like a single statement and stays safe inside unbraced
// if/else chains.

// Fails unless |expr| (a JxlDecoderStatus) equals JXL_DEC_SUCCESS.
#define JXL_TRY(expr)                        \
  do {                                       \
    JxlDecoderStatus status = (expr);        \
    if (status != JXL_DEC_SUCCESS) {         \
      return Transition::TerminateFailure(); \
    }                                        \
  } while (0)

// Fails when |expr| (anything convertible to bool) evaluates to false.
#define JXL_TRY_BOOL(expr)                   \
  do {                                       \
    bool succeeded = (expr);                 \
    if (!succeeded) {                        \
      return Transition::TerminateFailure(); \
    }                                        \
  } while (0)
|
||||
|
||||
static LazyLogModule sJXLLog("JXLDecoder");
|
||||
|
||||
// Builds a decoder for |aImage|: an unbuffered lexer that keeps feeding
// JXL_DATA, a libjxl decoder instance, and a thread-based parallel runner
// attached to that decoder.
nsJXLDecoder::nsJXLDecoder(RasterImage* aImage)
  : Decoder(aImage)
  , mLexer(Transition::ToUnbuffered(State::FINISHED_JXL_DATA,
                                    State::JXL_DATA,
                                    SIZE_MAX),
           Transition::TerminateSuccess())
  , mDecoder(JxlDecoderMake(nullptr))
  // NOTE(review): PreferredThreadCount() consults IsMetadataDecode() while the
  // object is still being constructed - confirm the metadata flag is already
  // meaningful at this point.
  , mParallelRunner(
      JxlThreadParallelRunnerMake(nullptr, PreferredThreadCount()))
  , mNumFrames(0)
  , mTimeout(FrameTimeout::Forever())
  , mContinue(false)
{
  // Only these decoder events are handled explicitly by ReadJXLData().
  const int wantedEvents =
    JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE;
  JxlDecoderSubscribeEvents(mDecoder.get(), wantedEvents);

  JxlDecoderSetParallelRunner(mDecoder.get(),
                              JxlThreadParallelRunner,
                              mParallelRunner.get());

  MOZ_LOG(sJXLLog, LogLevel::Debug,
          ("[this=%p] nsJXLDecoder::nsJXLDecoder", this));
}
|
||||
|
||||
// Nothing is released by hand here; the body only traces the destruction in
// the "JXLDecoder" log module.
nsJXLDecoder::~nsJXLDecoder()
{
  MOZ_LOG(sJXLLog,
          LogLevel::Debug,
          ("[this=%p] nsJXLDecoder::~nsJXLDecoder", this));
}
|
||||
|
||||
// Number of worker threads to request for the libjxl parallel runner:
// none for a metadata decode, libjxl's default otherwise.
size_t nsJXLDecoder::PreferredThreadCount()
{
  const bool metadataOnly = IsMetadataDecode();
  return metadataOnly
           ? 0  // no additional worker thread
           : JxlThreadParallelRunnerDefaultNumWorkerThreads();
}
|
||||
|
||||
// Decoder entry point: drives the streaming lexer over |aIterator| and
// dispatches each lexer state to its handler.
//
// @param aIterator  Source data to lex.
// @param aOnResume  Passed straight through to the lexer.
// @return           Whatever the lexer reports for this run.
LexerResult
nsJXLDecoder::DoDecode(SourceBufferIterator& aIterator, IResumable* aOnResume)
{
  MOZ_ASSERT(!HasError(), "Shouldn't call DoDecode after error!");

  // Only |this| is needed by the state handlers, so capture it explicitly
  // rather than relying on the implicit capture performed by [=].
  return mLexer.Lex(aIterator, aOnResume,
                    [this](State aState, const char* aData, size_t aLength) {
    switch (aState) {
      case State::JXL_DATA:
        return ReadJXLData(aData, aLength);
      case State::FINISHED_JXL_DATA:
        return FinishedJXLData();
    }
    MOZ_CRASH("Unknown State");
  });
}
|
||||
|
||||
// Reads the four bytes at |aRawPixelInOut|, packs them into one 32-bit pixel
// via gfxPackedPixel(), and moves the pointer on to the next four bytes.
//
// @param aRawPixelInOut  In: points at the four bytes to pack.
//                        Out: advanced by four bytes.
// @return                The packed pixel, wrapped for SurfacePipe writers.
NextPixel<uint32_t>
nsJXLDecoder::PackRGBAPixelAndAdvance(uint8_t*& aRawPixelInOut)
{
  const uint8_t* const src = aRawPixelInOut;

  // gfxPackedPixel() receives byte 3 first, followed by bytes 0, 1 and 2.
  // NOTE(review): presumably alpha, red, green, blue - confirm against
  // gfxPackedPixel's declaration.
  const uint32_t pixel = gfxPackedPixel(src[3], src[0], src[1], src[2]);

  aRawPixelInOut = aRawPixelInOut + 4;
  return AsVariant(pixel);
}
|
||||
|
||||
// Feeds one chunk of source data to libjxl and reacts to every decoder event
// until the decoder asks for more input, a frame is completed and we yield,
// the decode finishes, or an error occurs.
//
// @param aData    Start of the chunk handed over by the lexer.
// @param aLength  Number of bytes available at |aData|.
// @return         ContinueUnbuffered(JXL_DATA) when more input is needed,
//                 ToAfterYield(JXL_DATA) after a non-final animation frame,
//                 TerminateSuccess() when done, TerminateFailure() on error.
LexerTransition<nsJXLDecoder::State>
nsJXLDecoder::ReadJXLData(const char* aData, size_t aLength)
{
  // Ignore data we have already read.
  // This will only occur as a result of a yield for animation.
  if (!mContinue) {
    const uint8_t* input = reinterpret_cast<const uint8_t*>(aData);
    size_t length = aLength;
    if (mBuffer.length() != 0) {
      // Bytes the decoder left unconsumed last time come first, followed by
      // the new chunk; the decoder then reads from mBuffer instead of aData.
      JXL_TRY_BOOL(mBuffer.append(aData, aLength));
      input = mBuffer.begin();
      length = mBuffer.length();
    }
    JXL_TRY(JxlDecoderSetInput(mDecoder.get(), input, length));
  }
  mContinue = false;

  while (true) {
    JxlDecoderStatus status = JxlDecoderProcessInput(mDecoder.get());
    switch (status) {
      case JXL_DEC_ERROR:
      default:
        return Transition::TerminateFailure();

      case JXL_DEC_NEED_MORE_INPUT: {
        // |remaining| counts unprocessed bytes at the end of whatever buffer
        // was passed to JxlDecoderSetInput(). That buffer is mBuffer whenever
        // mBuffer is non-empty here, and aData otherwise, so the tail has to
        // be taken from the matching buffer. Taking it from aData
        // unconditionally would read before aData whenever the decoder left
        // more than aLength bytes unconsumed.
        size_t remaining = JxlDecoderReleaseInput(mDecoder.get());
        if (mBuffer.length() != 0) {
          if (remaining > mBuffer.length()) {
            return Transition::TerminateFailure();
          }
          // Drop the consumed prefix; the unconsumed tail stays in place.
          mBuffer.erase(mBuffer.begin(), mBuffer.end() - remaining);
        } else {
          if (remaining > aLength) {
            return Transition::TerminateFailure();
          }
          JXL_TRY_BOOL(mBuffer.append(aData + aLength - remaining, remaining));
        }

        if (mNumFrames == 0 && InFrame()) {
          // If an image was flushed by JxlDecoderFlushImage, then we know that
          // JXL_DEC_FRAME has already been run and there is a pipe.
          if (JxlDecoderFlushImage(mDecoder.get()) == JXL_DEC_SUCCESS) {
            // A full frame partial image is written to the buffer.
            mPipe.ResetToFirstRow();
            for (uint8_t* rowPtr = mOutBuffer.begin();
                 rowPtr < mOutBuffer.end(); rowPtr += mInfo.xsize * 4) {
              mPipe.WritePixels<uint32_t>([&]{
                return PackRGBAPixelAndAdvance(rowPtr);
              });
            }

            if (Maybe<SurfaceInvalidRect> invalidRect =
                  mPipe.TakeInvalidRect()) {
              PostInvalidation(invalidRect->mInputSpaceRect,
                               Some(invalidRect->mOutputSpaceRect));
            }
          }
        }

        return Transition::ContinueUnbuffered(State::JXL_DATA);
      }

      case JXL_DEC_BASIC_INFO: {
        JXL_TRY(JxlDecoderGetBasicInfo(mDecoder.get(), &mInfo));
        PostSize(mInfo.xsize, mInfo.ysize);
        if (mInfo.alpha_bits > 0) {
          PostHasTransparency();
        }
        // For animated images a metadata decode keeps going until the first
        // frame header has been seen (handled under JXL_DEC_FRAME).
        if (!mInfo.have_animation && IsMetadataDecode()) {
          return Transition::TerminateSuccess();
        }
        break;
      }

      case JXL_DEC_FRAME: {
        if (mInfo.have_animation) {
          JXL_TRY(JxlDecoderGetFrameHeader(mDecoder.get(), &mFrameHeader));
          // Frame duration is expressed in ticks; convert it to milliseconds
          // using the ticks-per-second fraction from the basic info.
          int32_t duration = (int32_t)(1000.0 * mFrameHeader.duration *
                                       mInfo.animation.tps_denominator /
                                       mInfo.animation.tps_numerator);

          mTimeout = FrameTimeout::FromRawMilliseconds(duration);

          if (!HasAnimation()) {
            PostIsAnimated(mTimeout);
          }
        }

        bool is_last = mInfo.have_animation ? mFrameHeader.is_last : true;
        MOZ_LOG(sJXLLog, LogLevel::Debug,
                ("[this=%p] nsJXLDecoder::ReadJXLData - frame %d, is_last %d, "
                 "metadata decode %d, first frame decode %d\n",
                 this, mNumFrames, is_last, IsMetadataDecode(),
                 IsFirstFrameDecode()));

        if (IsMetadataDecode()) {
          return Transition::TerminateSuccess();
        }

        Maybe<AnimationParams> animParams;
        if (!IsFirstFrameDecode()) {
          animParams.emplace(AnimationParams {
            FullFrame().ToUnknownRect(), mTimeout, mNumFrames,
            BlendMethod::SOURCE, DisposalMethod::CLEAR
          });
        }

        SurfacePipeFlags pipeFlags = SurfacePipeFlags();

        if (mNumFrames == 0) {
          // The first frame may be displayed progressively.
          pipeFlags |= SurfacePipeFlags::PROGRESSIVE_DISPLAY;
        }

        Maybe<SurfacePipe> pipe = SurfacePipeFactory::CreateSurfacePipe(
          this, Size(), OutputSize(), FullFrame(), SurfaceFormat::B8G8R8A8,
          animParams, pipeFlags);

        if (!pipe) {
          MOZ_LOG(sJXLLog, LogLevel::Debug,
                  ("[this=%p] nsJXLDecoder::ReadJXLData - no pipe\n", this));
          return Transition::TerminateFailure();
        }

        mPipe = std::move(*pipe);

        break;
      }

      case JXL_DEC_NEED_IMAGE_OUT_BUFFER: {
        // Ask libjxl how large a 4-channel, 8-bit-per-channel output buffer
        // must be, then hand it a buffer of exactly that size.
        size_t size = 0;
        JxlPixelFormat format{4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
        JXL_TRY(JxlDecoderImageOutBufferSize(mDecoder.get(), &format, &size));

        mOutBuffer.clear();
        JXL_TRY_BOOL(mOutBuffer.growBy(size));
        JXL_TRY(JxlDecoderSetImageOutBuffer(mDecoder.get(), &format,
                                            mOutBuffer.begin(), size));
        break;
      }

      case JXL_DEC_FULL_IMAGE: {
        mPipe.ResetToFirstRow();
        for (uint8_t* rowPtr = mOutBuffer.begin(); rowPtr < mOutBuffer.end();
             rowPtr += mInfo.xsize * 4) {
          mPipe.WritePixels<uint32_t>([&]{
            return PackRGBAPixelAndAdvance(rowPtr);
          });
        }

        if (Maybe<SurfaceInvalidRect> invalidRect = mPipe.TakeInvalidRect()) {
          PostInvalidation(invalidRect->mInputSpaceRect,
                           Some(invalidRect->mOutputSpaceRect));
        }

        PostFrameStop();

        if (!IsFirstFrameDecode() && mInfo.have_animation &&
            !mFrameHeader.is_last) {
          mNumFrames++;
          mContinue = true;
          // Notify for a new frame but there may be data in the current buffer
          // that can immediately be processed.
          return Transition::ToAfterYield(State::JXL_DATA);
        }
        MOZ_FALLTHROUGH; // We are done.
      }

      case JXL_DEC_SUCCESS: {
        PostDecodeDone(HasAnimation() ? (int32_t)mInfo.animation.num_loops - 1
                                      : 0);
        return Transition::TerminateSuccess();
      }
    }
  }
}
|
||||
|
||||
// Handler for the lexer's FINISHED_JXL_DATA state. The unbuffered read is set
// up with a SIZE_MAX length, so getting here is treated as a logic error:
// it asserts in debug builds and fails the decode.
LexerTransition<nsJXLDecoder::State>
nsJXLDecoder::FinishedJXLData()
{
  MOZ_ASSERT_UNREACHABLE("Read the entire address space?");

  return Transition::TerminateFailure();
}
|
||||
|
||||
} // namespace image
|
||||
} // namespace mozilla
|
||||
@@ -0,0 +1,62 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef mozilla_image_decoders_nsJXLDecoder_h
|
||||
#define mozilla_image_decoders_nsJXLDecoder_h
|
||||
|
||||
#include "Decoder.h"
|
||||
#include "SurfacePipe.h"
|
||||
|
||||
#include "jxl/decode_cxx.h"
|
||||
#include "jxl/thread_parallel_runner_cxx.h"
|
||||
|
||||
namespace mozilla {
|
||||
namespace image {
|
||||
class RasterImage;
|
||||
|
||||
class nsJXLDecoder final : public Decoder {
|
||||
public:
|
||||
virtual ~nsJXLDecoder();
|
||||
|
||||
protected:
|
||||
LexerResult DoDecode(SourceBufferIterator& aIterator,
|
||||
IResumable* aOnResume) override;
|
||||
|
||||
private:
|
||||
friend class DecoderFactory;
|
||||
|
||||
// Decoders should only be instantiated via DecoderFactory.
|
||||
explicit nsJXLDecoder(RasterImage* aImage);
|
||||
|
||||
size_t PreferredThreadCount();
|
||||
|
||||
enum class State { JXL_DATA, FINISHED_JXL_DATA };
|
||||
|
||||
// Copied from nsPNGDecoder with the same name. Handles R&B channels
|
||||
// as well as alpha premultiplication for us. See Issue #2057.
|
||||
NextPixel<uint32_t> PackRGBAPixelAndAdvance(uint8_t*& aRawPixelInOut);
|
||||
|
||||
LexerTransition<State> ReadJXLData(const char* aData, size_t aLength);
|
||||
LexerTransition<State> FinishedJXLData();
|
||||
|
||||
StreamingLexer<State> mLexer;
|
||||
JxlDecoderPtr mDecoder;
|
||||
JxlThreadParallelRunnerPtr mParallelRunner;
|
||||
Vector<uint8_t> mBuffer;
|
||||
Vector<uint8_t> mOutBuffer;
|
||||
JxlBasicInfo mInfo{};
|
||||
JxlFrameHeader mFrameHeader;
|
||||
|
||||
uint32_t mNumFrames;
|
||||
FrameTimeout mTimeout;
|
||||
SurfacePipe mPipe;
|
||||
bool mContinue;
|
||||
};
|
||||
|
||||
} // namespace image
|
||||
} // namespace mozilla
|
||||
|
||||
#endif // mozilla_image_decoders_nsJXLDecoder_h
|
||||
@@ -2550,6 +2550,12 @@ imgLoader::GetMimeTypeFromContent(const char* aContents,
|
||||
!memcmp(aContents + 8, "WEBP", 4)) {
|
||||
aContentType.AssignLiteral(IMAGE_WEBP);
|
||||
|
||||
} else if ((aLength >= 2 && !memcmp(aContents, "\xFF\x0A", 2)) ||
|
||||
(aLength >= 12 &&
|
||||
!memcmp(aContents, "\x00\x00\x00\x0CJXL \x0D\x0A\x87\x0A", 12))) {
|
||||
// Each version is for containerless and containerful files respectively.
|
||||
aContentType.AssignLiteral(IMAGE_JXL);
|
||||
|
||||
} else {
|
||||
/* none of the above? I give up */
|
||||
return NS_ERROR_NOT_AVAILABLE;
|
||||
|
||||
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 3.2 KiB |
@@ -0,0 +1,3 @@
|
||||
# JXL tests
|
||||
|
||||
pref(image.jxl.enabled,true) == jxl-size-33x33.jxl jxl-size-33x33.png
|
||||
@@ -28,6 +28,9 @@ skip-if(Android) include ico/reftest.list
|
||||
# JPEG tests
|
||||
include jpeg/reftest.list
|
||||
|
||||
# JXL tests
|
||||
skip-if(Android) include jxl/reftest.list
|
||||
|
||||
# GIF tests
|
||||
include gif/reftest.list
|
||||
|
||||
|
||||
@@ -79,6 +79,24 @@ aom_codec_peek_stream_info
|
||||
aom_img_alloc
|
||||
aom_img_free
|
||||
#endif
|
||||
#ifdef MOZ_JXL
|
||||
JxlDecoderCreate
|
||||
JxlDecoderDestroy
|
||||
JxlDecoderSetParallelRunner
|
||||
JxlDecoderSubscribeEvents
|
||||
JxlDecoderProcessInput
|
||||
JxlDecoderSetInput
|
||||
JxlDecoderReleaseInput
|
||||
JxlDecoderGetBasicInfo
|
||||
JxlDecoderImageOutBufferSize
|
||||
JxlDecoderSetImageOutBuffer
|
||||
JxlDecoderGetFrameHeader
|
||||
JxlDecoderFlushImage
|
||||
JxlThreadParallelRunner
|
||||
JxlThreadParallelRunnerCreate
|
||||
JxlThreadParallelRunnerDestroy
|
||||
JxlThreadParallelRunnerDefaultNumWorkerThreads
|
||||
#endif
|
||||
#ifdef MOZ_VORBIS
|
||||
ogg_page_bos
|
||||
ogg_page_granulepos
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
This directory contains build files for the Highway C++
|
||||
SIMD library.
|
||||
|
||||
Any patches or additional configuration to be applied to the
|
||||
upstream source should be kept here in the media/highway
|
||||
directory.
|
||||
|
||||
The upstream highway git repository is:
|
||||
|
||||
https://github.com/google/highway
|
||||
|
||||
The version used was tagged 1.0.2.
|
||||
@@ -0,0 +1,48 @@
|
||||
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
|
||||
# vim: set filetype=python:
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
LOCAL_INCLUDES += [
|
||||
"/media/highway/src/",
|
||||
]
|
||||
|
||||
SOURCES += [
|
||||
"/media/highway/src/hwy/aligned_allocator.cc",
|
||||
"/media/highway/src/hwy/contrib/image/image.cc",
|
||||
"/media/highway/src/hwy/per_target.cc",
|
||||
"/media/highway/src/hwy/targets.cc",
|
||||
]
|
||||
|
||||
EXPORTS.hwy += [
|
||||
"/media/highway/src/hwy/aligned_allocator.h",
|
||||
"/media/highway/src/hwy/base.h",
|
||||
"/media/highway/src/hwy/cache_control.h",
|
||||
"/media/highway/src/hwy/detect_compiler_arch.h",
|
||||
"/media/highway/src/hwy/detect_targets.h",
|
||||
"/media/highway/src/hwy/foreach_target.h",
|
||||
"/media/highway/src/hwy/highway.h",
|
||||
"/media/highway/src/hwy/highway_export.h",
|
||||
"/media/highway/src/hwy/targets.h",
|
||||
]
|
||||
|
||||
EXPORTS.hwy.ops += [
|
||||
"/media/highway/src/hwy/ops/arm_neon-inl.h",
|
||||
"/media/highway/src/hwy/ops/arm_sve-inl.h",
|
||||
"/media/highway/src/hwy/ops/emu128-inl.h",
|
||||
"/media/highway/src/hwy/ops/generic_ops-inl.h",
|
||||
"/media/highway/src/hwy/ops/rvv-inl.h",
|
||||
"/media/highway/src/hwy/ops/scalar-inl.h",
|
||||
"/media/highway/src/hwy/ops/set_macros-inl.h",
|
||||
"/media/highway/src/hwy/ops/shared-inl.h",
|
||||
"/media/highway/src/hwy/ops/wasm_128-inl.h",
|
||||
"/media/highway/src/hwy/ops/x86_128-inl.h",
|
||||
"/media/highway/src/hwy/ops/x86_256-inl.h",
|
||||
"/media/highway/src/hwy/ops/x86_512-inl.h",
|
||||
]
|
||||
|
||||
FINAL_LIBRARY = "gkmedias"
|
||||
|
||||
# We allow warnings for third-party code that can be updated from upstream.
|
||||
ALLOW_COMPILER_WARNINGS = True
|
||||
@@ -0,0 +1,413 @@
|
||||
load("@bazel_skylib//lib:selects.bzl", "selects")
|
||||
|
||||
load("@rules_cc//cc:defs.bzl", "cc_test")
|
||||
package(
|
||||
default_visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
exports_files(["LICENSE"])
|
||||
|
||||
# Detect compiler:
|
||||
config_setting(
|
||||
name = "compiler_clang",
|
||||
flag_values = {"@bazel_tools//tools/cpp:compiler": "clang"},
|
||||
)
|
||||
|
||||
config_setting(
|
||||
name = "compiler_clangcl",
|
||||
flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"},
|
||||
)
|
||||
|
||||
config_setting(
|
||||
name = "compiler_msvc_actual",
|
||||
flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
|
||||
)
|
||||
|
||||
# The above is insufficient for Bazel on Windows, which does not seem to
|
||||
# detect/set a compiler flag. This workaround prevents compile errors due to
|
||||
# passing clang-only warning flags to MSVC.
|
||||
config_setting(
|
||||
name = "compiler_msvc_cpu",
|
||||
values = {
|
||||
"cpu": "x64_windows",
|
||||
},
|
||||
)
|
||||
|
||||
selects.config_setting_group(
|
||||
name = "compiler_msvc",
|
||||
match_any = [
|
||||
":compiler_msvc_actual",
|
||||
":compiler_msvc_cpu",
|
||||
],
|
||||
)
|
||||
|
||||
config_setting(
|
||||
name = "compiler_emscripten",
|
||||
values = {"cpu": "wasm32"},
|
||||
)
|
||||
|
||||
# See https://github.com/bazelbuild/bazel/issues/12707
|
||||
config_setting(
|
||||
name = "compiler_gcc_bug",
|
||||
flag_values = {
|
||||
"@bazel_tools//tools/cpp:compiler": "compiler",
|
||||
},
|
||||
)
|
||||
|
||||
config_setting(
|
||||
name = "compiler_gcc_actual",
|
||||
flag_values = {
|
||||
"@bazel_tools//tools/cpp:compiler": "gcc",
|
||||
},
|
||||
)
|
||||
|
||||
selects.config_setting_group(
|
||||
name = "compiler_gcc",
|
||||
match_any = [
|
||||
":compiler_gcc_bug",
|
||||
":compiler_gcc_actual",
|
||||
],
|
||||
)
|
||||
|
||||
# Additional warnings for Clang OR GCC (skip for MSVC)
|
||||
CLANG_GCC_COPTS = [
|
||||
"-Wunused-parameter",
|
||||
"-Wunused-variable",
|
||||
"-Wextra-semi",
|
||||
"-Wunreachable-code",
|
||||
]
|
||||
|
||||
# Warnings supported by Clang and Clang-cl
|
||||
CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [
|
||||
"-Wfloat-overflow-conversion",
|
||||
"-Wfloat-zero-conversion",
|
||||
"-Wfor-loop-analysis",
|
||||
"-Wgnu-redeclared-enum",
|
||||
"-Winfinite-recursion",
|
||||
"-Wliteral-conversion",
|
||||
"-Wno-c++98-compat",
|
||||
"-Wno-unused-command-line-argument",
|
||||
"-Wprivate-header",
|
||||
"-Wself-assign",
|
||||
"-Wstring-conversion",
|
||||
"-Wtautological-overlap-compare",
|
||||
"-Wthread-safety-analysis",
|
||||
"-Wundefined-func-template",
|
||||
"-Wunused-comparison",
|
||||
]
|
||||
|
||||
# Warnings only supported by Clang, but not Clang-cl
|
||||
CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [
|
||||
# Do not treat the third_party headers as system headers when building
|
||||
# highway - the errors are pertinent.
|
||||
"--no-system-header-prefix=third_party/highway",
|
||||
]
|
||||
|
||||
COPTS = select({
|
||||
":compiler_msvc": [],
|
||||
":compiler_gcc": CLANG_GCC_COPTS,
|
||||
":compiler_clangcl": CLANG_OR_CLANGCL_OPTS,
|
||||
# Default to clang because compiler detection only works in Bazel
|
||||
"//conditions:default": CLANG_ONLY_COPTS,
|
||||
}) + select({
|
||||
"@platforms//cpu:riscv64": [
|
||||
"-march=rv64gcv1p0",
|
||||
"-menable-experimental-extensions",
|
||||
],
|
||||
"//conditions:default": [
|
||||
],
|
||||
})
|
||||
|
||||
DEFINES = select({
|
||||
":compiler_msvc": ["HWY_SHARED_DEFINE"],
|
||||
":compiler_clangcl": ["HWY_SHARED_DEFINE"],
|
||||
"//conditions:default": [],
|
||||
})
|
||||
|
||||
# Unused on Bazel builds, where this is not defined/known; Copybara replaces
|
||||
# usages with an empty list.
|
||||
COMPAT = [
|
||||
"//buildenv/target:non_prod", # includes mobile/vendor.
|
||||
]
|
||||
|
||||
# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without
|
||||
# failing integration tests, if the machine running tests does not support the
|
||||
# newly enabled instruction set, or the failure is only caught by sanitizers
|
||||
# which do not run in CI.
|
||||
|
||||
cc_library(
|
||||
name = "hwy",
|
||||
srcs = [
|
||||
"hwy/aligned_allocator.cc",
|
||||
"hwy/per_target.cc",
|
||||
"hwy/print.cc",
|
||||
"hwy/targets.cc",
|
||||
],
|
||||
# Normal headers with include guards
|
||||
hdrs = [
|
||||
"hwy/aligned_allocator.h",
|
||||
"hwy/base.h",
|
||||
"hwy/cache_control.h",
|
||||
"hwy/detect_compiler_arch.h", # private
|
||||
"hwy/print.h",
|
||||
],
|
||||
compatible_with = [],
|
||||
copts = COPTS,
|
||||
defines = DEFINES,
|
||||
local_defines = ["hwy_EXPORTS"],
|
||||
textual_hdrs = [
|
||||
# These are textual because config macros influence them:
|
||||
"hwy/detect_targets.h", # private
|
||||
"hwy/targets.h",
|
||||
# This .cc file #includes itself through foreach_target.h
|
||||
"hwy/per_target.cc",
|
||||
# End of list
|
||||
"hwy/highway.h", # public
|
||||
"hwy/foreach_target.h", # public
|
||||
"hwy/per_target.h", # public
|
||||
"hwy/print-inl.h", # public
|
||||
"hwy/highway_export.h", # public
|
||||
"hwy/ops/arm_neon-inl.h",
|
||||
"hwy/ops/arm_sve-inl.h",
|
||||
"hwy/ops/emu128-inl.h",
|
||||
"hwy/ops/generic_ops-inl.h",
|
||||
"hwy/ops/scalar-inl.h",
|
||||
"hwy/ops/set_macros-inl.h",
|
||||
"hwy/ops/shared-inl.h",
|
||||
"hwy/ops/x86_128-inl.h",
|
||||
"hwy/ops/x86_256-inl.h",
|
||||
"hwy/ops/x86_512-inl.h",
|
||||
# Select avoids recompiling native arch if only non-native changed
|
||||
] + select({
|
||||
":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
|
||||
"//conditions:default": [],
|
||||
}) + select({
|
||||
"@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "algo",
|
||||
compatible_with = [],
|
||||
copts = COPTS,
|
||||
textual_hdrs = [
|
||||
"hwy/contrib/algo/copy-inl.h",
|
||||
"hwy/contrib/algo/find-inl.h",
|
||||
"hwy/contrib/algo/transform-inl.h",
|
||||
],
|
||||
deps = [
|
||||
":hwy",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "dot",
|
||||
compatible_with = [],
|
||||
copts = COPTS,
|
||||
textual_hdrs = [
|
||||
"hwy/contrib/dot/dot-inl.h",
|
||||
],
|
||||
deps = [
|
||||
":hwy",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "image",
|
||||
srcs = [
|
||||
"hwy/contrib/image/image.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"hwy/contrib/image/image.h",
|
||||
],
|
||||
compatible_with = [],
|
||||
copts = COPTS,
|
||||
local_defines = ["hwy_contrib_EXPORTS"],
|
||||
deps = [
|
||||
":hwy",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "math",
|
||||
compatible_with = [],
|
||||
copts = COPTS,
|
||||
textual_hdrs = [
|
||||
"hwy/contrib/math/math-inl.h",
|
||||
],
|
||||
deps = [
|
||||
":hwy",
|
||||
],
|
||||
)
|
||||
|
||||
# Everything required for tests that use Highway.
|
||||
cc_library(
|
||||
name = "hwy_test_util",
|
||||
srcs = ["hwy/tests/test_util.cc"],
|
||||
hdrs = ["hwy/tests/test_util.h"],
|
||||
compatible_with = [],
|
||||
copts = COPTS,
|
||||
local_defines = ["hwy_test_EXPORTS"],
|
||||
textual_hdrs = [
|
||||
"hwy/tests/test_util-inl.h",
|
||||
"hwy/tests/hwy_gtest.h",
|
||||
],
|
||||
# Must not depend on a gtest variant, which can conflict with the
|
||||
# GUNIT_INTERNAL_BUILD_MODE defined by the test.
|
||||
deps = [
|
||||
":hwy",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "nanobenchmark",
|
||||
srcs = ["hwy/nanobenchmark.cc"],
|
||||
hdrs = ["hwy/nanobenchmark.h"],
|
||||
compatible_with = [],
|
||||
copts = COPTS,
|
||||
local_defines = ["hwy_EXPORTS"],
|
||||
deps = [":hwy"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "benchmark",
|
||||
srcs = ["hwy/examples/benchmark.cc"],
|
||||
copts = COPTS,
|
||||
deps = [
|
||||
":hwy",
|
||||
":nanobenchmark",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "skeleton",
|
||||
srcs = ["hwy/examples/skeleton.cc"],
|
||||
hdrs = ["hwy/examples/skeleton.h"],
|
||||
copts = COPTS,
|
||||
local_defines = ["hwy_EXPORTS"],
|
||||
textual_hdrs = ["hwy/examples/skeleton-inl.h"],
|
||||
deps = [
|
||||
":hwy",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "list_targets",
|
||||
srcs = ["hwy/tests/list_targets.cc"],
|
||||
deps = [":hwy"],
|
||||
)
|
||||
|
||||
# path, name
|
||||
HWY_TESTS = [
|
||||
("hwy/contrib/algo/", "copy_test"),
|
||||
("hwy/contrib/algo/", "find_test"),
|
||||
("hwy/contrib/algo/", "transform_test"),
|
||||
("hwy/contrib/dot/", "dot_test"),
|
||||
("hwy/contrib/image/", "image_test"),
|
||||
("hwy/contrib/math/", "math_test"),
|
||||
# contrib/sort has its own BUILD, we add it to GUITAR_TESTS.
|
||||
("hwy/examples/", "skeleton_test"),
|
||||
("hwy/", "nanobenchmark_test"),
|
||||
("hwy/", "aligned_allocator_test"),
|
||||
("hwy/", "base_test"),
|
||||
("hwy/", "highway_test"),
|
||||
("hwy/", "targets_test"),
|
||||
("hwy/tests/", "arithmetic_test"),
|
||||
("hwy/tests/", "blockwise_test"),
|
||||
("hwy/tests/", "blockwise_shift_test"),
|
||||
("hwy/tests/", "combine_test"),
|
||||
("hwy/tests/", "compare_test"),
|
||||
("hwy/tests/", "compress_test"),
|
||||
("hwy/tests/", "convert_test"),
|
||||
("hwy/tests/", "crypto_test"),
|
||||
("hwy/tests/", "demote_test"),
|
||||
("hwy/tests/", "float_test"),
|
||||
("hwy/tests/", "if_test"),
|
||||
("hwy/tests/", "interleaved_test"),
|
||||
("hwy/tests/", "logical_test"),
|
||||
("hwy/tests/", "mask_test"),
|
||||
("hwy/tests/", "mask_mem_test"),
|
||||
("hwy/tests/", "memory_test"),
|
||||
("hwy/tests/", "mul_test"),
|
||||
("hwy/tests/", "reduction_test"),
|
||||
("hwy/tests/", "reverse_test"),
|
||||
("hwy/tests/", "shift_test"),
|
||||
("hwy/tests/", "swizzle_test"),
|
||||
("hwy/tests/", "test_util_test"),
|
||||
]
|
||||
|
||||
HWY_TEST_COPTS = select({
|
||||
":compiler_msvc": [],
|
||||
"//conditions:default": [
|
||||
# gTest triggers this warning (which is enabled by the
|
||||
# extra-semi in COPTS), so we need to disable it here,
|
||||
# but it's still enabled for :hwy.
|
||||
"-Wno-c++98-compat-extra-semi",
|
||||
],
|
||||
})
|
||||
|
||||
HWY_TEST_DEPS = [
|
||||
":algo",
|
||||
":dot",
|
||||
":hwy",
|
||||
":hwy_test_util",
|
||||
":image",
|
||||
":math",
|
||||
":nanobenchmark",
|
||||
":skeleton",
|
||||
"//hwy/contrib/sort:vqsort",
|
||||
"@com_google_googletest//:gtest_main",
|
||||
]
|
||||
|
||||
[
|
||||
[
|
||||
cc_test(
|
||||
name = test,
|
||||
size = "medium",
|
||||
timeout = "long", # default moderate is not enough for math_test
|
||||
srcs = [
|
||||
subdir + test + ".cc",
|
||||
],
|
||||
copts = COPTS + HWY_TEST_COPTS,
|
||||
features = select({
|
||||
"@platforms//cpu:riscv64": ["fully_static_link"],
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
linkopts = select({
|
||||
":compiler_emscripten": [
|
||||
"-s ASSERTIONS=2",
|
||||
"-s ENVIRONMENT=node,shell,web",
|
||||
"-s ERROR_ON_UNDEFINED_SYMBOLS=1",
|
||||
"-s DEMANGLE_SUPPORT=1",
|
||||
"-s EXIT_RUNTIME=1",
|
||||
"-s ALLOW_MEMORY_GROWTH=1",
|
||||
"--pre-js $(location :preamble.js.lds)",
|
||||
],
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
linkstatic = select({
|
||||
"@platforms//cpu:riscv64": True,
|
||||
"//conditions:default": False,
|
||||
}),
|
||||
local_defines = ["HWY_IS_TEST"],
|
||||
# for test_suite.
|
||||
tags = ["hwy_ops_test"],
|
||||
deps = HWY_TEST_DEPS + select({
|
||||
":compiler_emscripten": [":preamble.js.lds"],
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
),
|
||||
]
|
||||
for subdir, test in HWY_TESTS
|
||||
]
|
||||
|
||||
# For manually building the tests we define here (:all does not work in --config=msvc)
|
||||
test_suite(
|
||||
name = "hwy_ops_tests",
|
||||
tags = ["hwy_ops_test"],
|
||||
)
|
||||
|
||||
# Placeholder for integration test, do not remove
|
||||
@@ -0,0 +1,580 @@
|
||||
# Copyright 2019 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)

# CMP0083 (added in CMake 3.14): set PIE flags for targets that have
# POSITION_INDEPENDENT_CODE enabled.
if(POLICY CMP0083)
  cmake_policy(SET CMP0083 NEW)
endif()

# CMP0111: keep the OLD behavior as a workaround for CMake 3.19 raising the
# error 'IMPORTED_LOCATION not set for imported target "GTest::gtest_main"'.
if(POLICY CMP0111)
  cmake_policy(SET CMP0111 OLD)
endif()
|
||||
|
||||
project(hwy VERSION 1.0.2)  # Keep in sync with highway.h version

# The ABI version comes straight from the project() version above: the full
# version string and its major component. Both are applied to the library
# targets as their VERSION / SOVERSION properties.
set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
set(LIBRARY_VERSION "${hwy_VERSION}")
|
||||
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)

include(GNUInstallDirs)

# Fall back to RelWithDebInfo when the user did not choose a build type.
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()

# Search for an atomics implementation, using the find module shipped in
# this project's cmake/ directory.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
find_package(Atomics REQUIRED)

# Enable PIE binaries by default if supported. The CheckPIESupported module
# is included as OPTIONAL, so the check is skipped when it is unavailable.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
  check_pie_supported(LANGUAGES CXX)
  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
  endif()
endif()
|
||||
|
||||
# ---- User-configurable cache options ----

# Target selection.
set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?")

# Unconditionally adding -Werror risks breaking the build when new warnings
# arise due to compiler/platform changes, so it is off by default.
# Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")

# Optional parts of the build; all enabled by default.
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_ENABLE_TESTS ON CACHE BOOL "Enable HWY tests")
|
||||
|
||||
include(CheckCXXSourceCompiles)

# Compiles a probe program that only builds when the C++ compiler predefines
# MACRO_NAME, and stores the outcome of that check in RESULT_VAR.
function(hwy_check_macro_defined MACRO_NAME RESULT_VAR)
  check_cxx_source_compiles(
    "int main() {
      #if !defined(${MACRO_NAME})
      static_assert(false, \"${MACRO_NAME} is not defined\");
      #endif
      return 0;
    }"
    ${RESULT_VAR}
  )
endfunction()

# HWY_EMSCRIPTEN: true when compiling with Emscripten.
hwy_check_macro_defined(__EMSCRIPTEN__ HWY_EMSCRIPTEN)
# HWY_RISCV: true when targeting RISC-V.
hwy_check_macro_defined(__riscv HWY_RISCV)
|
||||
|
||||
if (HWY_ENABLE_CONTRIB)
  # Glob all the vqsort_*.cc traits so this file does not need to be modified
  # when additional special cases are added.
  file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")

  # Remaining contrib sources, grouped by directory.
  list(APPEND HWY_CONTRIB_SOURCES
    hwy/contrib/dot/dot-inl.h
  )
  list(APPEND HWY_CONTRIB_SOURCES
    hwy/contrib/image/image.cc
    hwy/contrib/image/image.h
  )
  list(APPEND HWY_CONTRIB_SOURCES
    hwy/contrib/math/math-inl.h
  )
  list(APPEND HWY_CONTRIB_SOURCES
    hwy/contrib/sort/shared-inl.h
    hwy/contrib/sort/sorting_networks-inl.h
    hwy/contrib/sort/traits-inl.h
    hwy/contrib/sort/traits128-inl.h
    hwy/contrib/sort/vqsort-inl.h
    hwy/contrib/sort/vqsort.cc
    hwy/contrib/sort/vqsort.h
  )
  list(APPEND HWY_CONTRIB_SOURCES
    hwy/contrib/algo/copy-inl.h
    hwy/contrib/algo/find-inl.h
    hwy/contrib/algo/transform-inl.h
  )
endif()  # HWY_ENABLE_CONTRIB
|
||||
|
||||
# Sources and headers of the core hwy library. The install step later picks
# the *.h entries out of this list, so headers must be listed here too.
set(HWY_SOURCES
  hwy/aligned_allocator.cc
  hwy/aligned_allocator.h
  hwy/base.h
  hwy/cache_control.h
  hwy/detect_compiler_arch.h  # private
  hwy/detect_targets.h  # private
  hwy/foreach_target.h
  hwy/highway.h
  hwy/highway_export.h
  hwy/nanobenchmark.cc
  hwy/nanobenchmark.h
)
# Headers under hwy/ops/.
list(APPEND HWY_SOURCES
  hwy/ops/arm_neon-inl.h
  hwy/ops/arm_sve-inl.h
  hwy/ops/emu128-inl.h
  hwy/ops/generic_ops-inl.h
  hwy/ops/rvv-inl.h
  hwy/ops/scalar-inl.h
  hwy/ops/set_macros-inl.h
  hwy/ops/shared-inl.h
  hwy/ops/wasm_128-inl.h
  hwy/ops/x86_128-inl.h
  hwy/ops/x86_256-inl.h
  hwy/ops/x86_512-inl.h
)
list(APPEND HWY_SOURCES
  hwy/per_target.cc
  hwy/per_target.h
  hwy/print-inl.h
  hwy/print.cc
  hwy/print.h
  hwy/targets.cc
  hwy/targets.h
)
|
||||
|
||||
# Sources of the hwy_test support library (test utilities, not the tests
# themselves).
set(HWY_TEST_SOURCES
  hwy/tests/hwy_gtest.h
  hwy/tests/test_util-inl.h
  hwy/tests/test_util.cc
  hwy/tests/test_util.h
)
|
||||
|
||||
# HWY_FLAGS: compile options applied (PRIVATE) to the library, tools,
# examples and tests defined in this file.
if (MSVC)
  set(HWY_FLAGS
    # fix build error C1128 in blockwise*_test & arithmetic_test
    /bigobj
  )
else()
  set(HWY_FLAGS
    # Avoid changing binaries based on the current time and date.
    -Wno-builtin-macro-redefined
    -D__DATE__="redacted"
    -D__TIMESTAMP__="redacted"
    -D__TIME__="redacted"

    # Optimizations
    -fmerge-all-constants

    # Warnings
    -Wall
    -Wextra
    # These are not included in Wall nor Wextra:
    -Wconversion
    -Wsign-conversion
    -Wvla
    -Wnon-virtual-dtor
  )

  # The compiler ID is tested by variable name rather than as
  # ${CMAKE_CXX_COMPILER_ID}: the expanded form is a syntax error when the
  # variable is empty and may be re-evaluated as a variable name by if().
  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    list(APPEND HWY_FLAGS
      -Wfloat-overflow-conversion
      -Wfloat-zero-conversion
      -Wfor-loop-analysis
      -Wgnu-redeclared-enum
      -Winfinite-recursion
      -Wself-assign
      -Wstring-conversion
      -Wtautological-overlap-compare
      -Wthread-safety-analysis
      -Wundefined-func-template

      -fno-cxx-exceptions
      -fno-slp-vectorize
      -fno-vectorize

      # Use color in messages
      -fdiagnostics-show-option -fcolor-diagnostics
    )
    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0)
      list(APPEND HWY_FLAGS -Wc++2a-extensions)
    endif()
  endif()

  if (WIN32)
    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
      # -Wno-used-but-marked-unused was previously listed twice; once suffices.
      list(APPEND HWY_FLAGS
        -Wno-global-constructors
        -Wno-language-extension-token
        -Wno-used-but-marked-unused
        -Wno-shadow-field-in-constructor
        -Wno-unused-member-function
        -Wno-unused-template
        -Wno-c++98-compat-pedantic
        -Wno-zero-as-null-pointer-constant
      )
    endif()

    list(APPEND HWY_FLAGS
      -Wno-cast-align
      -Wno-double-promotion
      -Wno-float-equal
      -Wno-format-nonliteral
      -Wno-shadow
      -Wno-sign-conversion
    )
  else()
    list(APPEND HWY_FLAGS
      -fmath-errno
      -fno-exceptions
    )
  endif()  # WIN32

  if (HWY_CMAKE_ARM7)
    list(APPEND HWY_FLAGS
      -march=armv7-a
      -mfpu=neon-vfpv4
      -mfloat-abi=hard  # must match the toolchain specified as CXX=
      -mfp16-format=ieee  # required for vcvt_f32_f16
    )
  endif()  # HWY_CMAKE_ARM7

  if(HWY_RISCV)
    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
      # Not yet supported by GCC. When runtime dispatch is supported and
      # implemented, we will remove v from the required flags. Until then, using
      # clang for RISC-V will require the CPU to support the V extension (1.0).
      list(APPEND HWY_FLAGS -march=rv64gcv1p0)
      list(APPEND HWY_FLAGS -menable-experimental-extensions)
    endif()
  endif()

  if (HWY_WARNINGS_ARE_ERRORS)
    list(APPEND HWY_FLAGS -Werror)
  endif()

  # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
  # because it was not compiled with 'atomics' or 'bulk-memory' features."
  if (HWY_EMSCRIPTEN)
    list(APPEND HWY_FLAGS -matomics)
  endif()

endif()  # !MSVC
|
||||
|
||||
# By default prefer STATIC build (legacy behavior)
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF)
# only expose shared/static options to advanced users:
mark_as_advanced(BUILD_SHARED_LIBS HWY_FORCE_STATIC_LIBS)

# Define visibility settings globally:
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)

# Same decision add_library() would make from BUILD_SHARED_LIBS, plus the
# HWY_FORCE_STATIC_LIBS override. DLLEXPORT_TO_DEFINE is the preprocessor
# define that drives the build; it is also used in the *.pc files.
if (BUILD_SHARED_LIBS AND NOT HWY_FORCE_STATIC_LIBS)
  set(HWY_LIBRARY_TYPE "SHARED")
  set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE")
else()
  set(HWY_LIBRARY_TYPE "STATIC")
  set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE")
endif()
|
||||
|
||||
# ---- Core library target: hwy ----
add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES})
target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
target_compile_features(hwy PUBLIC cxx_std_11)
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
target_include_directories(hwy PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
# LINK_DEPENDS: relink when the symbol version script changes.
set_target_properties(hwy PROPERTIES
  POSITION_INDEPENDENT_CODE ON
  VERSION ${LIBRARY_VERSION}
  SOVERSION ${LIBRARY_SOVERSION}
  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy PRIVATE ${ATOMICS_LIBRARIES})
# The version script is not supported by MSVC/Clang; safe to skip there
# (we use DLLEXPORT annotations).
if(UNIX AND NOT APPLE)
  set_property(TARGET hwy APPEND_STRING PROPERTY
    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
|
||||
|
||||
# Determine the target architecture name (HWY_ARCH).
if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "unknown")
  set(HWY_ARCH ${CMAKE_SYSTEM_PROCESSOR})
else()
  # uname -p is broken on this system. Try uname -m
  execute_process(COMMAND uname -m
    OUTPUT_VARIABLE HWY_ARCH
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)
endif()
message(STATUS "Architecture: " ${HWY_ARCH})

# On MIPS, pass "-z noexecstack" to the linker for hwy and (PUBLIC) for
# everything that links against it.
# NOTE(review): target_link_options() was added in CMake 3.13, which is newer
# than the 3.10 minimum this file declares - confirm the intended minimum.
if (HWY_ARCH MATCHES "mips")
  target_link_options(hwy PUBLIC "LINKER:-z,noexecstack")
endif()
|
||||
|
||||
|
||||
# ---- Optional library target: hwy_contrib ----
if (HWY_ENABLE_CONTRIB)
  add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
  target_link_libraries(hwy_contrib hwy)
  target_compile_features(hwy_contrib PUBLIC cxx_std_11)
  target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
  target_include_directories(hwy_contrib PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
  # LINK_DEPENDS: relink when the symbol version script changes.
  set_target_properties(hwy_contrib PROPERTIES
    POSITION_INDEPENDENT_CODE ON
    VERSION ${LIBRARY_VERSION}
    SOVERSION ${LIBRARY_SOVERSION}
    LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
  # The version script is not supported by MSVC/Clang; safe to skip there
  # (we use DLLEXPORT annotations).
  if(UNIX AND NOT APPLE)
    set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
      LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
  endif()
endif()  # HWY_ENABLE_CONTRIB
|
||||
|
||||
# ---- Test-support library target: hwy_test ----
add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
target_link_libraries(hwy_test hwy)
target_compile_features(hwy_test PUBLIC cxx_std_11)
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
target_include_directories(hwy_test PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
# LINK_DEPENDS: relink when the symbol version script changes.
set_target_properties(hwy_test PROPERTIES
  POSITION_INDEPENDENT_CODE ON
  VERSION ${LIBRARY_VERSION}
  SOVERSION ${LIBRARY_SOVERSION}
  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# The version script is not supported by MSVC/Clang; safe to skip there
# (we use DLLEXPORT annotations).
if(UNIX AND NOT APPLE)
  set_property(TARGET hwy_test APPEND_STRING PROPERTY
    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
|
||||
|
||||
# -------------------------------------------------------- hwy_list_targets
# A small tool that prints the targets compiled in under the current flags.
# It is run as a post-build step and prints to stderr at build time, after
# hwy has been built.
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_link_libraries(hwy_list_targets hwy)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
target_include_directories(hwy_list_targets PRIVATE
  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
# The bare target name cannot always be executed (e.g. when the '.\' prefix
# is missing), whereas $<TARGET_FILE:...> always yields the path to the
# executable. The command therefore uses that full path, preceded by the
# emulator prefix (if any). The step is skipped when cross-compiling without
# an emulator, and "|| (exit 0)" keeps a failing run from failing the build.
if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR)
  add_custom_command(TARGET hwy_list_targets POST_BUILD
    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
endif()
|
||||
|
||||
# --------------------------------------------------------
|
||||
# Allow skipping the following sections for projects that do not need them:
|
||||
# tests, examples, benchmarks and installation.
|
||||
|
||||
# -------------------------------------------------------- install library
|
||||
if (HWY_ENABLE_INSTALL)

  # Installs every header (*.h) among the files passed as arguments into
  # CMAKE_INSTALL_INCLUDEDIR, keeping each file's directory relative to the
  # current source directory. Non-header entries are ignored.
  function(hwy_install_headers)
    foreach (source ${ARGN})
      # "\\." reaches the regex engine as "\.", i.e. a literal dot. The
      # single-backslash spelling is reduced to "." by CMake and would match
      # any character before the final "h".
      if ("${source}" MATCHES "\\.h$")
        get_filename_component(dirname "${source}" DIRECTORY)
        install(FILES "${source}"
          DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
      endif()
    endforeach()
  endfunction()

  install(TARGETS hwy
    LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  hwy_install_headers(${HWY_SOURCES})

  if (HWY_ENABLE_CONTRIB)
    install(TARGETS hwy_contrib
      LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
      ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
    hwy_install_headers(${HWY_CONTRIB_SOURCES})
  endif()  # HWY_ENABLE_CONTRIB

  install(TARGETS hwy_test
    LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  hwy_install_headers(${HWY_TEST_SOURCES})

  # Add a pkg-config file for libhwy and the contrib/test libraries.
  # Use this project's own version: CMAKE_PROJECT_VERSION is the version of
  # the top-level project (and only exists since CMake 3.12), which is wrong
  # when hwy is built as a subproject of another project.
  set(HWY_LIBRARY_VERSION "${hwy_VERSION}")
  set(HWY_PC_FILES libhwy.pc libhwy-test.pc)
  if (HWY_ENABLE_CONTRIB)
    list(APPEND HWY_PC_FILES libhwy-contrib.pc)
  endif()  # HWY_ENABLE_CONTRIB
  foreach (pc ${HWY_PC_FILES})
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
    install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
  endforeach()

endif()  # HWY_ENABLE_INSTALL
|
||||
# -------------------------------------------------------- Examples
|
||||
if (HWY_ENABLE_EXAMPLES)

  # Avoids mismatch between GTest's static CRT and our dynamic.
  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

  # Programming exercise with integrated benchmark
  add_executable(hwy_benchmark
    hwy/examples/benchmark.cc
    hwy/nanobenchmark.h)
  target_link_libraries(hwy_benchmark hwy)
  # Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
  # -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
  target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
  # Place the binary in the examples/ output directory.
  set_target_properties(hwy_benchmark
    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")

endif()  # HWY_ENABLE_EXAMPLES
|
||||
# -------------------------------------------------------- Tests
|
||||
|
||||
include(CTest)
|
||||
|
||||
if(BUILD_TESTING AND HWY_ENABLE_TESTS)
|
||||
enable_testing()
|
||||
include(GoogleTest)
|
||||
|
||||
set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?")
if(NOT HWY_SYSTEM_GTEST)
  # Download and unpack googletest at configure time: generate a helper
  # project from CMakeLists.txt.in, then configure and build it in place.
  configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)

  execute_process(
    COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download
    RESULT_VARIABLE result)
  if(result)
    message(FATAL_ERROR "CMake step for googletest failed: ${result}")
  endif()

  execute_process(
    COMMAND ${CMAKE_COMMAND} --build .
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download
    RESULT_VARIABLE result)
  if(result)
    message(FATAL_ERROR "Build step for googletest failed: ${result}")
  endif()

  # Prevent overriding the parent project's compiler/linker
  # settings on Windows
  set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

  # Add googletest directly to our build. This defines
  # the gtest and gtest_main targets.
  add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
                   ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
                   EXCLUDE_FROM_ALL)
else()
  # Use the googletest already installed on the system.
  find_package(GTest REQUIRED)
endif()  # HWY_SYSTEM_GTEST
|
||||
|
||||
# Test sources; each file becomes its own test executable.
set(HWY_TEST_FILES
  hwy/contrib/algo/copy_test.cc
  hwy/contrib/algo/find_test.cc
  hwy/contrib/algo/transform_test.cc
)
list(APPEND HWY_TEST_FILES
  hwy/aligned_allocator_test.cc
  hwy/base_test.cc
  hwy/highway_test.cc
  hwy/nanobenchmark_test.cc
  hwy/targets_test.cc
  hwy/examples/skeleton_test.cc
)
list(APPEND HWY_TEST_FILES
  hwy/tests/arithmetic_test.cc
  hwy/tests/blockwise_test.cc
  hwy/tests/blockwise_shift_test.cc
  hwy/tests/combine_test.cc
  hwy/tests/compare_test.cc
  hwy/tests/compress_test.cc
  hwy/tests/convert_test.cc
  hwy/tests/crypto_test.cc
  hwy/tests/demote_test.cc
  hwy/tests/float_test.cc
  hwy/tests/if_test.cc
  hwy/tests/interleaved_test.cc
  hwy/tests/logical_test.cc
  hwy/tests/mask_test.cc
  hwy/tests/mask_mem_test.cc
  hwy/tests/memory_test.cc
  hwy/tests/mul_test.cc
  hwy/tests/reduction_test.cc
  hwy/tests/reverse_test.cc
  hwy/tests/shift_test.cc
  hwy/tests/swizzle_test.cc
  hwy/tests/test_util_test.cc
)

# Libraries every test links against.
set(HWY_TEST_LIBS hwy hwy_test)

# The contrib library and its tests are only added when contrib is enabled.
if (HWY_ENABLE_CONTRIB)
  list(APPEND HWY_TEST_LIBS hwy_contrib)
  list(APPEND HWY_TEST_FILES
    hwy/contrib/dot/dot_test.cc
    hwy/contrib/image/image_test.cc
    # Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
    # not reproducible locally. Still tested via bazel build.
    # hwy/contrib/math/math_test.cc
    hwy/contrib/sort/sort_test.cc
  )
endif()  # HWY_ENABLE_CONTRIB
|
||||
|
||||
# Select the googletest targets to link: the ones defined by the bundled
# googletest build, or the imported targets of a system install, whose names
# are chosen by CMake version.
if(NOT HWY_SYSTEM_GTEST)
  set(HWY_GTEST_LIBS gtest gtest_main)
elseif(CMAKE_VERSION VERSION_LESS 3.20)
  set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
else()
  set(HWY_GTEST_LIBS GTest::gtest GTest::gtest_main)
endif()
|
||||
|
||||
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
foreach (TESTFILE IN LISTS HWY_TEST_FILES)
  # The TESTNAME is the name without the extension or directory.
  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
  add_executable(${TESTNAME} ${TESTFILE})

  # HWY_IS_TEST: test all targets, not just the best/baseline. This changes
  # the default policy to all-attainable; note that setting -DHWY_COMPILE_*
  # directly can cause compile errors because only one may be set, and other
  # CMakeLists.txt that include us may set them.
  target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS} -DHWY_IS_TEST=1)

  target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})

  # Output test targets in the test directory.
  set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
  if (HWY_EMSCRIPTEN)
    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
  endif()

  # The keyword for the discovery time limit depends on the CMake version:
  # DISCOVERY_TIMEOUT from 3.10.3 on, TIMEOUT before that.
  if(NOT CMAKE_VERSION VERSION_LESS "3.10.3")
    gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60)
  else()
    gtest_discover_tests(${TESTNAME} TIMEOUT 60)
  endif()
endforeach()

# The skeleton test uses the skeleton library code.
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
|
||||
|
||||
endif() # BUILD_TESTING
|
||||
@@ -0,0 +1,15 @@
|
||||
# Helper project, configured and built at configure time of the parent
# project, whose only job is to fetch the googletest sources.
# The minimum version matches the parent CMakeLists.txt (3.10): declaring
# compatibility with 2.8.12 is deprecated and rejected by recent CMake
# releases, and this file is only ever processed by the parent's CMake.
cmake_minimum_required(VERSION 3.10)

project(googletest-download NONE)

include(ExternalProject)
# Only the download/checkout step is wanted here, so the configure, build,
# install and test commands are all empty. The parent project adds the
# checked-out sources to its own build.
ExternalProject_Add(googletest
  GIT_REPOSITORY    https://github.com/google/googletest.git
  GIT_TAG           43efa0a4efd40c78b9210d15373112081899a97c
  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)
|
||||
@@ -0,0 +1,33 @@
|
||||
# How to Contribute
|
||||
|
||||
We'd love to accept your patches and contributions to this project. There are
|
||||
just a few small guidelines you need to follow.
|
||||
|
||||
## Contributor License Agreement
|
||||
|
||||
Contributions to this project must be accompanied by a Contributor License
|
||||
Agreement. You (or your employer) retain the copyright to your contribution;
|
||||
this simply gives us permission to use and redistribute your contributions as
|
||||
part of the project. Head over to <https://cla.developers.google.com/> to see
|
||||
your current agreements on file or to sign a new one.
|
||||
|
||||
You generally only need to submit a CLA once, so if you've already submitted one
|
||||
(even if it was for a different project), you probably don't need to do it
|
||||
again.
|
||||
|
||||
## Code reviews
|
||||
|
||||
All submissions, including submissions by project members, require review. We
|
||||
use GitHub pull requests for this purpose. Consult
|
||||
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
|
||||
information on using pull requests.
|
||||
|
||||
## Testing
|
||||
|
||||
This repository is used by JPEG XL, so major API changes will require
|
||||
coordination. Please get in touch with us beforehand, e.g. by raising an issue.
|
||||
|
||||
## Community Guidelines
|
||||
|
||||
This project follows
|
||||
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
|
||||
@@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@@ -0,0 +1,322 @@
|
||||
# Efficient and performance-portable vector software
|
||||
|
||||
[//]: # (placeholder, do not remove)
|
||||
|
||||
Highway is a C++ library that provides portable SIMD/vector intrinsics.
|
||||
|
||||
## Why
|
||||
|
||||
We are passionate about high-performance software. We see major untapped
|
||||
potential in CPUs (servers, mobile, desktops). Highway is for engineers who want
|
||||
to reliably and economically push the boundaries of what is possible in
|
||||
software.
|
||||
|
||||
## How
|
||||
|
||||
CPUs provide SIMD/vector instructions that apply the same operation to multiple
|
||||
data items. This can reduce energy usage e.g. *fivefold* because fewer
|
||||
instructions are executed. We also often see *5-10x* speedups.
|
||||
|
||||
Highway makes SIMD/vector programming practical and workable according to these
|
||||
guiding principles:
|
||||
|
||||
**Does what you expect**: Highway is a C++ library with carefully-chosen
|
||||
functions that map well to CPU instructions without extensive compiler
|
||||
transformations. The resulting code is more predictable and robust to code
|
||||
changes/compiler updates than autovectorization.
|
||||
|
||||
**Works on widely-used platforms**: Highway supports four architectures; the
|
||||
same application code can target eight instruction sets, including those with
|
||||
'scalable' vectors (size unknown at compile time). Highway only requires C++11
|
||||
and supports four families of compilers. If you would like to use Highway on
|
||||
other platforms, please raise an issue.
|
||||
|
||||
**Flexible to deploy**: Applications using Highway can run on heterogeneous
|
||||
clouds or client devices, choosing the best available instruction set at
|
||||
runtime. Alternatively, developers may choose to target a single instruction set
|
||||
without any runtime overhead. In both cases, the application code is the same
|
||||
except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one
|
||||
line of code.
|
||||
|
||||
**Suitable for a variety of domains**: Highway provides an extensive set of
|
||||
operations, used for image processing (floating-point), compression, video
|
||||
analysis, linear algebra, cryptography, sorting and random generation. We
|
||||
recognise that new use-cases may require additional ops and are happy to add
|
||||
them where it makes sense (e.g. no performance cliffs on some architectures). If
|
||||
you would like to discuss, please file an issue.
|
||||
|
||||
**Rewards data-parallel design**: Highway provides tools such as Gather,
|
||||
MaskedLoad, and FixedTag to enable speedups for legacy data structures. However,
|
||||
the biggest gains are unlocked by designing algorithms and data structures for
|
||||
scalable vectors. Helpful techniques include batching, structure-of-array
|
||||
layouts, and aligned/padded allocations.
|
||||
|
||||
## Examples
|
||||
|
||||
Online demos using Compiler Explorer:
|
||||
|
||||
- [multiple targets with dynamic dispatch](https://gcc.godbolt.org/z/zP7MYe9Yf)
|
||||
(recommended)
|
||||
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
|
||||
|
||||
Projects using Highway: (to add yours, feel free to raise an issue or contact us
|
||||
via the below email)
|
||||
|
||||
* [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
|
||||
* [JPEG XL image codec](https://github.com/libjxl/libjxl)
|
||||
* [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
|
||||
* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982))
|
||||
|
||||
## Current status
|
||||
|
||||
### Targets
|
||||
|
||||
Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
|
||||
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
|
||||
WASM SIMD, RISC-V V.
|
||||
|
||||
SVE was initially tested using farm-sve (see acknowledgments).
|
||||
|
||||
### Versioning
|
||||
|
||||
Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH),
|
||||
incrementing MINOR after backward-compatible additions and PATCH after
|
||||
backward-compatible fixes. We recommend using releases (rather than the Git tip)
|
||||
because they are tested more extensively, see below.
|
||||
|
||||
The current version 1.0 signals an increased focus on backwards compatibility.
|
||||
Applications using documented functionality will remain compatible with future
|
||||
updates that have the same major version number.
|
||||
|
||||
### Testing
|
||||
|
||||
Continuous integration tests build with a recent version of Clang (running on
|
||||
native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
|
||||
x86).
|
||||
|
||||
Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
|
||||
cross-compile. See the [testing process](g3doc/release_testing_process.md) for
|
||||
details.
|
||||
|
||||
### Related modules
|
||||
|
||||
The `contrib` directory contains SIMD-related utilities: an image class with
|
||||
aligned rows, a math library (16 functions already implemented, mostly
|
||||
trigonometry), and functions for computing dot products and sorting.
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses CMake to generate and build. In a Debian-based system you can
|
||||
install it via:
|
||||
|
||||
```bash
|
||||
sudo apt install cmake
|
||||
```
|
||||
|
||||
Highway's unit tests use [googletest](https://github.com/google/googletest).
|
||||
By default, Highway's CMake downloads this dependency at configuration time.
|
||||
You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
|
||||
installing gtest separately:
|
||||
|
||||
```bash
|
||||
sudo apt install libgtest-dev
|
||||
```
|
||||
|
||||
To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
|
||||
the standard CMake workflow can be used:
|
||||
|
||||
```bash
|
||||
mkdir -p build && cd build
|
||||
cmake ..
|
||||
make -j && make test
|
||||
```
|
||||
|
||||
Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
|
||||
|
||||
Bazel is also supported for building, but it is not as widely used/tested.
|
||||
|
||||
## Quick start
|
||||
|
||||
You can use the `benchmark` inside examples/ as a starting point.
|
||||
|
||||
A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
|
||||
and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
|
||||
indicates the number of instructions per operation.
|
||||
|
||||
The [FAQ](g3doc/faq.md) answers questions about portability, API design and
|
||||
where to find more information.
|
||||
|
||||
We recommend using full SIMD vectors whenever possible for maximum performance
|
||||
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
|
||||
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
|
||||
alternatives for use-cases requiring an upper bound on the lanes:
|
||||
|
||||
- For up to `N` lanes, specify `CappedTag<T, N>` or the equivalent
|
||||
`HWY_CAPPED(T, N)`. The actual number of lanes will be `N` rounded down to
|
||||
the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is
|
||||
useful for data structures such as a narrow matrix. A loop is still required
|
||||
because vectors may actually have fewer than `N` lanes.
|
||||
|
||||
- For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
|
||||
supported `N` depends on the target, but is guaranteed to be at least
|
||||
`16/sizeof(T)`.
|
||||
|
||||
Due to ADL restrictions, user code calling Highway ops must either:
|
||||
* Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or
|
||||
* prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE;
|
||||
hn::Add()`; or
|
||||
* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
|
||||
|
||||
Additionally, each function that calls Highway ops (such as `Load`) must either
|
||||
be prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
|
||||
`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
|
||||
their opening brace.
|
||||
|
||||
The entry points into code using Highway differ slightly depending on whether
|
||||
they use static or dynamic dispatch.
|
||||
|
||||
* For static dispatch, `HWY_TARGET` will be the best available target among
|
||||
`HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
|
||||
[quick-reference](g3doc/quick_reference.md)). Functions inside
|
||||
`HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
|
||||
the same module they are defined in. You can call the function from other
|
||||
modules by wrapping it in a regular function and declaring the regular
|
||||
function in a header.
|
||||
|
||||
* For dynamic dispatch, a table of function pointers is generated via the
|
||||
`HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
|
||||
call the best function pointer for the current CPU's supported targets. A
|
||||
module is automatically compiled for each target in `HWY_TARGETS` (see
|
||||
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
|
||||
defined and `foreach_target.h` is included.
|
||||
|
||||
When using dynamic dispatch, `foreach_target.h` is included from translation
|
||||
units (.cc files), not headers. Headers containing vector code shared between
|
||||
several translation units require a special include guard, for example the
|
||||
following taken from `examples/skeleton-inl.h`:
|
||||
|
||||
```
|
||||
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
// Your vector code
|
||||
#endif
|
||||
```
|
||||
|
||||
By convention, we name such headers `-inl.h` because their contents (often
|
||||
function templates) are usually inlined.
|
||||
|
||||
## Compiler flags
|
||||
|
||||
Applications should be compiled with optimizations enabled - without inlining,
|
||||
SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is
|
||||
generally sufficient.
|
||||
|
||||
For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to
|
||||
pass vector arguments in registers. If intending to use the AVX2 target together
|
||||
with half-width vectors (e.g. for `PromoteTo`), it is also important to compile
|
||||
with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4
|
||||
instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and
|
||||
non-VEX SSE4 may cause severe performance degradation. Unfortunately, the
|
||||
resulting binary will then require AVX2. Note that no such flag is needed for
|
||||
clang and GCC because they support target-specific attributes, which we use to
|
||||
ensure proper VEX code generation for AVX2 targets.
|
||||
|
||||
## Strip-mining loops
|
||||
|
||||
To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
|
||||
loop with number of iterations matching the preferred vector width.
|
||||
|
||||
In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
|
||||
the number of elements to process, and `N = Lanes(d)` the number of lanes in a
|
||||
full vector. Assume the loop body is given as a function `template<bool partial,
|
||||
class D> void LoopBody(D d, size_t index, size_t max_n)`.
|
||||
|
||||
Highway offers several ways to express loops where `N` need not divide `count`:
|
||||
|
||||
* Ensure all inputs/outputs are padded. Then the loop is simply
|
||||
|
||||
```
|
||||
for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
|
||||
```
|
||||
Here, the template parameter and third function argument (`max_n`) are not needed.
|
||||
|
||||
This is the preferred option, unless `N` is in the thousands and vector
|
||||
operations are pipelined with long latencies. This was the case for
|
||||
supercomputers in the 90s, but nowadays ALUs are cheap and we see most
|
||||
implementations split vectors into 1, 2 or 4 parts, so there is little cost
|
||||
to processing entire vectors even if we do not need all their lanes. Indeed
|
||||
this avoids the (potentially large) cost of predication or partial
|
||||
loads/stores on older targets, and does not duplicate code.
|
||||
|
||||
* Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This
|
||||
takes care of the loop and remainder handling and you simply define a
|
||||
generic lambda function (C++14) or functor which receives the current vector
|
||||
from the input/output array, plus optionally vectors from up to two extra
|
||||
input arrays, and returns the value to write to the input/output array.
|
||||
|
||||
Here is an example implementing the BLAS function SAXPY (`alpha * x + y`):
|
||||
|
||||
```
|
||||
Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR {
|
||||
return MulAdd(Set(d, alpha), v, v1);
|
||||
});
|
||||
```
|
||||
|
||||
* Process whole vectors as above, followed by a scalar loop:
|
||||
|
||||
```
|
||||
size_t i = 0;
|
||||
for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
|
||||
for (; i < count; ++i) LoopBody<false>(CappedTag<T, 1>(), i, 0);
|
||||
```
|
||||
The template parameter and third function argument (`max_n`) are again not needed.
|
||||
|
||||
This avoids duplicating code, and is reasonable if `count` is large.
|
||||
If `count` is small, the second loop may be slower than the next option.
|
||||
|
||||
* Process whole vectors as above, followed by a single call to a modified
|
||||
`LoopBody` with masking:
|
||||
|
||||
```
|
||||
size_t i = 0;
|
||||
for (; i + N <= count; i += N) {
|
||||
LoopBody<false>(d, i, 0);
|
||||
}
|
||||
if (i < count) {
|
||||
LoopBody<true>(d, i, count - i);
|
||||
}
|
||||
```
|
||||
Now the template parameter and third function argument can be used inside
|
||||
`LoopBody` to non-atomically 'blend' the first `num_remaining` (= `max_n`) lanes of `v`
|
||||
with the previous contents of memory at subsequent locations:
|
||||
`BlendedStore(v, FirstN(d, num_remaining), d, pointer);`. Similarly,
|
||||
`MaskedLoad(FirstN(d, num_remaining), d, pointer)` loads the first
|
||||
`num_remaining` elements and returns zero in other lanes.
|
||||
|
||||
This is a good default when it is infeasible to ensure vectors are padded,
|
||||
but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`!
|
||||
In contrast to the scalar loop, only a single final iteration is needed.
|
||||
The increased code size from two loop bodies is expected to be worthwhile
|
||||
because it avoids the cost of masking in all but the final iteration.
|
||||
|
||||
## Additional resources
|
||||
|
||||
* [Highway introduction (slides)](g3doc/highway_intro.pdf)
|
||||
* [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
|
||||
* [Design philosophy and comparison](g3doc/design_philosophy.md)
|
||||
* [Implementation details](g3doc/impl_details.md)
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger
|
||||
Bramas; it has proved useful for checking the SVE port on an x86 development
|
||||
machine.
|
||||
|
||||
This is not an officially supported Google product.
|
||||
Contact: janwas@google.com
|
||||
@@ -0,0 +1,24 @@
|
||||
# Bazel workspace for Highway. Declares the external archives that are
# downloaded with http_archive.
workspace(name = "highway")
|
||||
|
||||
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
|
||||
|
||||
# GoogleTest, pinned to a specific commit archive and verified via sha256.
http_archive(
|
||||
name = "com_google_googletest",
|
||||
urls = ["https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip"],
|
||||
sha256 = "5cf189eb6847b4f8fc603a3ffff3b0771c08eec7dd4bd961bfd45477dd13eb73",
|
||||
strip_prefix = "googletest-609281088cfefc76f9d0ce82e1ff6c30cc3591e5",
|
||||
)
|
||||
|
||||
# rules_cc, pinned to a specific commit archive and verified via sha256.
# See https://google.github.io/googletest/quickstart-bazel.html
|
||||
http_archive(
|
||||
name = "rules_cc",
|
||||
urls = ["https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.zip"],
|
||||
sha256 = "56ac9633c13d74cb71e0546f103ce1c58810e4a76aa8325da593ca4277908d72",
|
||||
strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556",
|
||||
)
|
||||
|
||||
# Need recent version for config_setting_group
# NOTE(review): unlike the two archives above, this one specifies no sha256,
# so the downloaded tarball is not integrity-checked. Consider pinning it.
|
||||
http_archive(
|
||||
name = "bazel_skylib",
|
||||
urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz"],
|
||||
)
|
||||
@@ -0,0 +1,157 @@
|
||||
highway (1.0.2-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add ExclusiveNeither, FindKnownFirstTrue, Ne128
|
||||
* Add 16-bit SumOfLanes/ReorderWidenMulAccumulate/ReorderDemote2To
|
||||
* Faster sort for low-entropy input, improved pivot selection
|
||||
* Add GN build system, Highway FAQ, k32v32 type to vqsort
|
||||
* CMake: Support find_package(GTest), add rvv-inl.h, add HWY_ENABLE_TESTS
|
||||
* Fix MIPS and C++20 build, Apple LLVM 10.3 detection, EMU128 AllTrue on RVV
|
||||
* Fix missing exec_prefix, RVV build, warnings, libatomic linking
|
||||
* Work around GCC 10.4 issue, disabled RDCYCLE, arm7 with vfpv3
|
||||
* Documentation/example improvements
|
||||
* Support static dispatch to SVE2_128 and SVE_256
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 27 Oct 2022 17:00:00 +0200
|
||||
|
||||
highway (1.0.1-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add Eq128, i64 Mul, unsigned->float ConvertTo
|
||||
* Faster sort for few unique keys, more robust pivot selection
|
||||
* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
|
||||
* Fix: avoid always_inline in debug, link atomic
|
||||
* GCC warnings: string.h, maybe-uninitialized, ignored-attributes
|
||||
* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
|
||||
* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Tue, 23 Aug 2022 10:00:00 +0200
|
||||
|
||||
highway (1.0.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* ABI change: 64-bit target values, more room for expansion
|
||||
* Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
|
||||
* Add HWY_SVE2_128 target
|
||||
* Sort speedups especially for 128-bit
|
||||
* Documentation clarifications
|
||||
* Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
|
||||
* Improved SVE codegen
|
||||
* Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
|
||||
* MSVC 2017 workarounds
|
||||
* Support for runtime dispatch on Arm/GCC/Linux
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Wed, 27 Jul 2022 10:00:00 +0200
|
||||
|
||||
highway (0.17.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
|
||||
* Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN
|
||||
* Add MulFixedPoint15, Or3
|
||||
* Add Copy[If], Find[If], Generate, Replace[If] algos
|
||||
* Add HWY_EMU128 target (replaces HWY_SCALAR)
|
||||
* HWY_RVV is feature-complete
|
||||
* Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros
|
||||
* Extend ConcatOdd/Even and StoreInterleaved* to all types
|
||||
* Allow CappedTag<T, nonPowerOfTwo>
|
||||
* Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc
|
||||
* Expand documentation
|
||||
* Fix RDTSCP crash in nanobenchmark
|
||||
* Fix XCR0 check (was ignoring AVX3 on ICL)
|
||||
* Support Arm/RISC-V timers
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Fri, 20 May 2022 10:00:00 +0200
|
||||
|
||||
highway (0.16.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add contrib/sort (vectorized quicksort)
|
||||
* Add IfNegativeThenElse, IfVecThenElse
|
||||
* Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound
|
||||
* Add OrAnd, Min128, Max128, Lt128, SumsOf8
|
||||
* Support capped/partial vectors on RVV/SVE, int64 in WASM
|
||||
* Support SVE2, shared library build
|
||||
* Remove deprecated overloads without the required d arg (UpperHalf etc.)
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 03 Feb 2022 11:00:00 +0100
|
||||
|
||||
highway (0.15.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
|
||||
* New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight
|
||||
* Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes
|
||||
* Contrib: add sort(ing network) and dot(product)
|
||||
* Targets: update RVV for LLVM, add experimental WASM2
|
||||
* Separate library hwy_test for test utils
|
||||
* Add non-macro Simd<> aliases
|
||||
* Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Wed, 10 Nov 2021 10:00:00 +0100
|
||||
|
||||
highway (0.14.2-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add MaskedLoad
|
||||
* Fix non-glibc PPC, Windows GCC, MSVC 19.14
|
||||
* Opt-in for -Werror; separate design_philosophy.md
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Tue, 24 Aug 2021 15:00:00 +0200
|
||||
|
||||
highway (0.14.1-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add LoadMaskBits, CompressBits[Store]
|
||||
* Fix CPU feature check (AES/F16C) and warnings
|
||||
* Improved DASSERT - disabled in optimized builds
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Tue, 17 Aug 2021 14:00:00 +0200
|
||||
|
||||
highway (0.14.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add SVE, S-SSE3, AVX3_DL targets
|
||||
* Support partial vectors in all ops
|
||||
* Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0
|
||||
* Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 29 Jul 2021 15:00:00 +0200
|
||||
|
||||
highway (0.12.2-1) UNRELEASED; urgency=medium
|
||||
|
||||
* fix scalar-only test and Windows macro conflict with Load/StoreFence
|
||||
* replace deprecated wasm intrinsics
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Mon, 31 May 2021 16:00:00 +0200
|
||||
|
||||
highway (0.12.1-1) UNRELEASED; urgency=medium
|
||||
|
||||
* doc updates, ARM GCC support, fix s390/ppc, complete partial vectors
|
||||
* fix warnings, faster ARM div/sqrt, separate hwy_contrib library
|
||||
* add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Wed, 19 May 2021 15:00:00 +0200
|
||||
|
||||
highway (0.12.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
|
||||
* Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
|
||||
* Proper IEEE rounding, reduce libstdc++ usage, inlined math
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 15 Apr 2021 20:00:00 +0200
|
||||
|
||||
highway (0.11.1-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Fix clang7 asan error, finish f16 conversions and add test
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 25 Feb 2021 16:00:00 +0200
|
||||
|
||||
highway (0.11.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 18 Feb 2021 20:00:00 +0200
|
||||
|
||||
highway (0.7.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Added API stability notice, Compress[Store], contrib/, SignBit, CopySign
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Tue, 5 Jan 2021 17:00:00 +0200
|
||||
|
||||
highway (0.1-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Initial debian package.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Mon, 19 Oct 2020 16:48:07 +0200
|
||||
@@ -0,0 +1 @@
|
||||
10
|
||||
@@ -0,0 +1,23 @@
|
||||
Source: highway
|
||||
Maintainer: JPEG XL Maintainers <jpegxl@google.com>
|
||||
Section: misc
|
||||
Priority: optional
|
||||
Standards-Version: 3.9.8
|
||||
Build-Depends: cmake,
|
||||
debhelper (>= 9),
|
||||
libgtest-dev
|
||||
Homepage: https://github.com/google/highway
|
||||
|
||||
Package: libhwy-dev
|
||||
Architecture: any
|
||||
Section: libdevel
|
||||
Depends: ${misc:Depends}
|
||||
Description: Efficient and performance-portable SIMD wrapper (developer files)
|
||||
This library provides type-safe and source-code portable wrappers over
|
||||
existing platform-specific intrinsics. Its design aims for simplicity,
|
||||
reliable efficiency across platforms, and immediate usability with current
|
||||
compilers.
|
||||
.
|
||||
This package installs the development files. There's no runtime library
|
||||
since most of Highway is implemented in headers and only a very small
|
||||
static library is needed.
|
||||
@@ -0,0 +1,20 @@
|
||||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: highway
|
||||
|
||||
Files: *
|
||||
Copyright: 2020 Google LLC
|
||||
License: Apache-2.0
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
.
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
.
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
.
|
||||
On Debian systems, the complete text of the Apache License, Version 2
|
||||
can be found in "/usr/share/common-licenses/Apache-2.0".
|
||||
@@ -0,0 +1,6 @@
|
||||
#!/usr/bin/make -f
|
||||
# Catch-all target: hand every packaging step to debhelper's `dh` sequencer,
# selecting its CMake build system.
%:
|
||||
dh $@ --buildsystem=cmake
|
||||
|
||||
# Override the configure step to pass -DHWY_SYSTEM_GTEST=ON to CMake. Per the
# Highway README this stops CMake from downloading googletest at configuration
# time; the separately installed package (libgtest-dev in Build-Depends) is
# expected instead.
override_dh_auto_configure:
|
||||
dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON
|
||||
@@ -0,0 +1 @@
|
||||
3.0 (quilt)
|
||||
@@ -0,0 +1,152 @@
|
||||
// Copyright 2019 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h> // malloc
|
||||
|
||||
#include <atomic>
|
||||
#include <limits>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
namespace {
|
||||
|
||||
// kAlignment: byte granularity of the offsets produced by NextAlignedOffset()
// and the minimum offset used by AllocateAlignedBytes(). Defaults to
// HWY_ALIGNMENT; raised to at least 4096 when compiling for RISC-V V.
#if HWY_ARCH_RVV && defined(__riscv_vector)
|
||||
// Not actually an upper bound on the size, but this value prevents crossing a
|
||||
// 4K boundary (relevant on Andes).
|
||||
constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096);
|
||||
#else
|
||||
constexpr size_t kAlignment = HWY_ALIGNMENT;
|
||||
#endif
|
||||
|
||||
// kAlias: size in bytes of the window within which allocation start offsets
// are varied. NextAlignedOffset() cycles through kAlias / kAlignment distinct
// multiples of kAlignment, and AllocateAlignedBytes() requests kAlias extra
// bytes per allocation. On non-x86 targets kAlias == kAlignment, so there is
// only a single offset group.
#if HWY_ARCH_X86
|
||||
// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
|
||||
// if this is used for single-vector allocations. 256 is more reasonable.
// NOTE(review): 256 presumably assumes HWY_ALIGNMENT == 64 on x86 (4 * 64);
// confirm against the definition of HWY_ALIGNMENT.
|
||||
constexpr size_t kAlias = kAlignment * 4;
|
||||
#else
|
||||
constexpr size_t kAlias = kAlignment;
|
||||
#endif
|
||||
|
||||
// Bookkeeping record for one aligned allocation. Per the layout comment in
// AllocateAlignedBytes(), the header is stored immediately before the payload.
// Declared under 1-byte packing, so no padding is inserted between or after
// the members.
#pragma pack(push, 1)
|
||||
struct AllocationHeader {
|
||||
// Presumably the pointer returned by the underlying allocator (malloc or the
// user-supplied alloc_ptr), kept so it can be released later. TODO confirm:
// the code that fills this field is outside the visible part of the file.
void* allocated;
|
||||
// Presumably the payload_size passed to AllocateAlignedBytes(). TODO confirm.
size_t payload_size;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
// Returns a 'random' (cyclical) offset for AllocateAlignedBytes.
|
||||
size_t NextAlignedOffset() {
|
||||
static std::atomic<uint32_t> next{0};
|
||||
constexpr uint32_t kGroups = kAlias / kAlignment;
|
||||
const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
|
||||
const size_t offset = kAlignment * group;
|
||||
HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias);
|
||||
return offset;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Allocates `payload_size` bytes aligned to kAlignment.
//
// Memory comes from `alloc_ptr(opaque_ptr, bytes)`, or from malloc() when
// `alloc_ptr` is null. Aborts (HWY_ASSERT) if `payload_size` is zero. Returns
// nullptr if `payload_size` is at least half the size_t range or if the
// underlying allocator returns nullptr.
HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size,
                                         AllocPtr alloc_ptr, void* opaque_ptr) {
  HWY_ASSERT(payload_size != 0);  // likely a bug in caller
  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
    HWY_DASSERT(false && "payload_size too large");
    return nullptr;
  }

  // What: | misalign | unused | AllocationHeader |payload
  // Size: |<= kAlias | offset                    |payload_size
  //       ^allocated.^aligned.^header............^payload
  // The header must immediately precede the payload, which must stay aligned.
  // To avoid wasting space the header lives at the end of `unused`, so
  // `unused` may not be empty: a zero offset is bumped to kAlignment
  // (= RoundUpTo(sizeof(AllocationHeader), kAlignment)).
  static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
  const size_t cyclic_offset = NextAlignedOffset();
  const size_t offset = (cyclic_offset == 0) ? kAlignment : cyclic_offset;

  const size_t allocated_size = kAlias + offset + payload_size;
  void* const allocated = (alloc_ptr == nullptr)
                              ? malloc(allocated_size)
                              : (*alloc_ptr)(opaque_ptr, allocated_size);
  if (allocated == nullptr) return nullptr;

  // Always round up, even if already aligned: kAlias extra bytes were
  // requested and there is no way to give them back.
  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
  const uintptr_t aligned =
      (reinterpret_cast<uintptr_t>(allocated) + kAlias) & ~(kAlias - 1);

  const uintptr_t payload = aligned + offset;  // still aligned

  // Record `allocated` and payload_size for FreeAlignedBytes() and
  // DeleteAlignedArray(); allocated_size can be reconstructed from
  // payload_size.
  AllocationHeader* const header =
      reinterpret_cast<AllocationHeader*>(payload) - 1;
  header->allocated = allocated;
  header->payload_size = payload_size;

  return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
}
|
||||
|
||||
// Releases memory obtained from AllocateAlignedBytes. A null
// `aligned_pointer` is ignored. The original allocation is read back from the
// header stored just before the payload and passed to
// `free_ptr(opaque_ptr, ...)`, or to free() when `free_ptr` is null.
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr) {
  if (aligned_pointer == nullptr) return;

  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
  HWY_DASSERT(payload % kAlignment == 0);
  void* const allocated =
      (reinterpret_cast<const AllocationHeader*>(payload) - 1)->allocated;

  if (free_ptr != nullptr) {
    (*free_ptr)(opaque_ptr, allocated);
    return;
  }
  free(allocated);
}
|
||||
|
||||
// static
|
||||
HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer,
|
||||
FreePtr free_ptr,
|
||||
void* opaque_ptr,
|
||||
ArrayDeleter deleter) {
|
||||
if (aligned_pointer == nullptr) return;
|
||||
|
||||
const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
|
||||
HWY_DASSERT(payload % kAlignment == 0);
|
||||
const AllocationHeader* header =
|
||||
reinterpret_cast<const AllocationHeader*>(payload) - 1;
|
||||
|
||||
if (deleter) {
|
||||
(*deleter)(aligned_pointer, header->payload_size);
|
||||
}
|
||||
|
||||
if (free_ptr == nullptr) {
|
||||
free(header->allocated);
|
||||
} else {
|
||||
(*free_ptr)(opaque_ptr, header->allocated);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
@@ -0,0 +1,212 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
||||
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
||||
|
||||
// Memory allocator with support for alignment and offsets.
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "hwy/highway_export.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
|
||||
// requires a literal. This matches typical L1 cache line sizes, which prevents
|
||||
// false sharing.
|
||||
#define HWY_ALIGNMENT 64
|
||||
|
||||
// Pointers to functions equivalent to malloc/free with an opaque void* passed
|
||||
// to them.
|
||||
using AllocPtr = void* (*)(void* opaque, size_t bytes);
|
||||
using FreePtr = void (*)(void* opaque, void* memory);
|
||||
|
||||
// Returns null or a pointer to at least `payload_size` (which must be nonzero)
|
||||
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
|
||||
// the vector size. Calls `alloc_ptr` with the passed `opaque_ptr` to obtain
|
||||
// memory or malloc() if it is null.
|
||||
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
|
||||
AllocPtr alloc_ptr, void* opaque_ptr);
|
||||
|
||||
// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
|
||||
// must have been returned from a previous call to `AllocateAlignedBytes`.
|
||||
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
|
||||
// `free_ptr` function is null, uses the default free().
|
||||
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
|
||||
FreePtr free_ptr, void* opaque_ptr);
|
||||
|
||||
// Class that deletes the aligned pointer passed to operator() calling the
|
||||
// destructor before freeing the pointer. This is equivalent to the
|
||||
// std::default_delete but for aligned objects. For a similar deleter equivalent
|
||||
// to free() for aligned memory see AlignedFreer().
|
||||
class AlignedDeleter {
|
||||
public:
|
||||
AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
|
||||
AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
|
||||
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
|
||||
|
||||
template <typename T>
|
||||
void operator()(T* aligned_pointer) const {
|
||||
return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
|
||||
TypedArrayDeleter<T>);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
|
||||
size_t elems = size_in_bytes / sizeof(T);
|
||||
for (size_t i = 0; i < elems; i++) {
|
||||
// Explicitly call the destructor on each element.
|
||||
(static_cast<T*>(ptr) + i)->~T();
|
||||
}
|
||||
}
|
||||
|
||||
// Function prototype that calls the destructor for each element in a typed
|
||||
// array. TypeArrayDeleter<T> would match this prototype.
|
||||
using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
|
||||
|
||||
HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
|
||||
FreePtr free_ptr,
|
||||
void* opaque_ptr,
|
||||
ArrayDeleter deleter);
|
||||
|
||||
FreePtr free_;
|
||||
void* opaque_ptr_;
|
||||
};
|
||||
|
||||
// Unique pointer to T with custom aligned deleter. This can be a single
|
||||
// element U or an array of element if T is a U[]. The custom aligned deleter
|
||||
// will call the destructor on U or each element of a U[] in the array case.
|
||||
template <typename T>
|
||||
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
|
||||
|
||||
// Aligned memory equivalent of make_unique<T> using the custom allocators
|
||||
// alloc/free with the passed `opaque` pointer. This function calls the
|
||||
// constructor with the passed Args... and calls the destructor of the object
|
||||
// when the AlignedUniquePtr is destroyed.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
|
||||
void* opaque, Args&&... args) {
|
||||
T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
|
||||
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
|
||||
AlignedDeleter(free, opaque));
|
||||
}
|
||||
|
||||
// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
|
||||
// functions.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
|
||||
T* ptr = static_cast<T*>(AllocateAlignedBytes(
|
||||
sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
|
||||
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
|
||||
AlignedDeleter());
|
||||
}
|
||||
|
||||
// Helpers for array allocators (avoids overflow)
|
||||
namespace detail {
|
||||
|
||||
// Returns x such that 1u << x == n when n is a power of two. For other n the
// result is the number of halvings needed to reach a value <= 1 (0 for n of 0
// or 1). Recursive so it stays usable in C++11 constant expressions.
static inline constexpr size_t ShiftCount(size_t n) {
  return (n > 1) ? ShiftCount(n >> 1) + 1 : 0;
}
|
||||
|
||||
template <typename T>
|
||||
T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
|
||||
constexpr size_t size = sizeof(T);
|
||||
|
||||
constexpr bool is_pow2 = (size & (size - 1)) == 0;
|
||||
constexpr size_t bits = ShiftCount(size);
|
||||
static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
|
||||
|
||||
const size_t bytes = is_pow2 ? items << bits : items * size;
|
||||
const size_t check = is_pow2 ? bytes >> bits : bytes / size;
|
||||
if (check != items) {
|
||||
return nullptr; // overflowed
|
||||
}
|
||||
return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Aligned memory equivalent of make_unique<T[]> for array types using the
|
||||
// custom allocators alloc/free. This function calls the constructor with the
|
||||
// passed Args... on every created item. The destructor of each element will be
|
||||
// called when the AlignedUniquePtr is destroyed.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
|
||||
size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
|
||||
T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
|
||||
if (ptr != nullptr) {
|
||||
for (size_t i = 0; i < items; i++) {
|
||||
new (ptr + i) T(std::forward<Args>(args)...);
|
||||
}
|
||||
}
|
||||
return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
|
||||
return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
|
||||
items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
|
||||
// but for aligned memory.
|
||||
class AlignedFreer {
|
||||
public:
|
||||
// Pass address of this to ctor to skip deleting externally-owned memory.
|
||||
static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
|
||||
|
||||
AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
|
||||
AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
|
||||
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
|
||||
|
||||
template <typename T>
|
||||
void operator()(T* aligned_pointer) const {
|
||||
// TODO(deymo): assert that we are using a POD type T.
|
||||
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
|
||||
}
|
||||
|
||||
private:
|
||||
FreePtr free_;
|
||||
void* opaque_ptr_;
|
||||
};
|
||||
|
||||
// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
|
||||
// data use AlignedUniquePtr.
|
||||
template <typename T>
|
||||
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
|
||||
|
||||
// Allocate an aligned and uninitialized array of POD values as a unique_ptr.
|
||||
// Upon destruction of the unique_ptr the aligned array will be freed.
|
||||
template <typename T>
|
||||
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
|
||||
FreePtr free, void* opaque) {
|
||||
return AlignedFreeUniquePtr<T[]>(
|
||||
detail::AllocateAlignedItems<T>(items, alloc, opaque),
|
||||
AlignedFreer(free, opaque));
|
||||
}
|
||||
|
||||
// Same as previous AllocateAligned(), using default allocate/free functions.
|
||||
template <typename T>
|
||||
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
|
||||
return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
||||
@@ -0,0 +1,278 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
|
||||
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>  // malloc, free

#include <array>
#include <new>
#include <random>
#include <set>
#include <vector>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// Test object of exactly N bytes that records, through an external counter,
// how many instances built with the explicit constructor are currently alive.
// The first data byte tells which constructor ran: 'a' for the default
// constructor, 'b' for the explicit one.
template <size_t N>
class SampleObject {
 public:
  SampleObject() { data_[0] = 'a'; }

  explicit SampleObject(int* counter) : counter_(counter) {
    if (counter != nullptr) {
      ++*counter;
    }
    data_[0] = 'b';
  }

  ~SampleObject() {
    if (counter_ != nullptr) {
      --*counter_;
    }
  }

  static_assert(N > sizeof(int*), "SampleObject size too small.");
  int* counter_ = nullptr;
  char data_[N - sizeof(int*)];
};
|
||||
|
||||
class FakeAllocator {
|
||||
public:
|
||||
// static AllocPtr and FreePtr member to be used with the alligned
|
||||
// allocator. These functions calls the private non-static members.
|
||||
static void* StaticAlloc(void* opaque, size_t bytes) {
|
||||
return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);
|
||||
}
|
||||
static void StaticFree(void* opaque, void* memory) {
|
||||
return reinterpret_cast<FakeAllocator*>(opaque)->Free(memory);
|
||||
}
|
||||
|
||||
// Returns the number of pending allocations to be freed.
|
||||
size_t PendingAllocs() { return allocs_.size(); }
|
||||
|
||||
private:
|
||||
void* Alloc(size_t bytes) {
|
||||
void* ret = malloc(bytes);
|
||||
allocs_.insert(ret);
|
||||
return ret;
|
||||
}
|
||||
void Free(void* memory) {
|
||||
if (!memory) return;
|
||||
EXPECT_NE(allocs_.end(), allocs_.find(memory));
|
||||
allocs_.erase(memory);
|
||||
free(memory);
|
||||
}
|
||||
|
||||
std::set<void*> allocs_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace hwy {
|
||||
|
||||
class AlignedAllocatorTest : public testing::Test {};
|
||||
|
||||
TEST(AlignedAllocatorTest, FreeNullptr) {
  // Freeing a null pointer must always be accepted, here with the default
  // free function and no opaque pointer.
  const void* const nothing = nullptr;
  FreeAlignedBytes(nothing, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, Log2) {
  // ShiftCount(n) is log2(n) for powers of two: 2^0, 2^1 and 2^3.
  EXPECT_EQ(3u, detail::ShiftCount(8));
  EXPECT_EQ(1u, detail::ShiftCount(2));
  EXPECT_EQ(0u, detail::ShiftCount(1));
}
|
||||
|
||||
// Allocator returns null when it detects overflow of items * sizeof(T).
|
||||
TEST(AlignedAllocatorTest, Overflow) {
|
||||
constexpr size_t max = ~size_t(0);
|
||||
constexpr size_t msb = (max >> 1) + 1;
|
||||
using Size5 = std::array<uint8_t, 5>;
|
||||
using Size10 = std::array<uint8_t, 10>;
|
||||
EXPECT_EQ(nullptr,
|
||||
detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
|
||||
EXPECT_EQ(nullptr,
|
||||
detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
|
||||
EXPECT_EQ(nullptr,
|
||||
detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
|
||||
EXPECT_EQ(nullptr,
|
||||
detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
|
||||
EXPECT_EQ(nullptr,
|
||||
detail::AllocateAlignedItems<double>(msb + 1, nullptr, nullptr));
|
||||
EXPECT_EQ(nullptr,
|
||||
detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
|
||||
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocDefaultPointers) {
  const size_t kSize = 7777;
  void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
                                   /*opaque_ptr=*/nullptr);
  ASSERT_NE(nullptr, ptr);
  // The returned address must honor the advertised alignment.
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);

  // Write every byte and fold neighbouring bytes into `ret` so that the
  // accesses cannot be optimized away.
  char* p = static_cast<char*>(ptr);
  size_t ret = 0;
  p[0] = static_cast<char>(0);
  for (size_t i = 1; i < kSize; i++) {
    p[i] = static_cast<char>(i & 0x7F);
    ret += static_cast<size_t>(p[i] * p[i - 1]);
  }
  EXPECT_NE(0U, ret);

  FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) {
  // Null single-object and array pointers must be constructible and
  // destructible with a default AlignedDeleter.
  using Object = SampleObject<32>;
  AlignedUniquePtr<Object> ptr(nullptr, AlignedDeleter());
  AlignedUniquePtr<Object[]> arr(nullptr, AlignedDeleter());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) {
  // Null single-object and array pointers must be constructible and
  // destructible with a default AlignedFreer.
  using Object = SampleObject<32>;
  AlignedFreeUniquePtr<Object> ptr(nullptr, AlignedFreer());
  AlignedFreeUniquePtr<Object[]> arr(nullptr, AlignedFreer());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, CustomAlloc) {
  const size_t kSize = 7777;
  FakeAllocator fake_alloc;

  void* ptr =
      AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
  ASSERT_NE(nullptr, ptr);
  // Exactly one request must have reached the custom allocator.
  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
  // The returned address must honor the advertised alignment.
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);

  // Freeing must hand that one allocation back to the custom allocator.
  FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
  {
    // Without arguments the default constructor must run: it stores 'a' in
    // data_[0] and leaves the counter pointer null.
    auto ptr = MakeUniqueAligned<SampleObject<24>>();
    EXPECT_EQ(nullptr, ptr->counter_);
    EXPECT_EQ('a', ptr->data_[0]);
  }
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAligned) {
  int counter = 0;
  {
    // The forwarded argument selects the explicit constructor, which bumps
    // the counter and stores 'b' in data_[0].
    auto ptr = MakeUniqueAligned<SampleObject<24>>(&counter);
    EXPECT_EQ(1, counter);
    EXPECT_EQ('b', ptr->data_[0]);
  }
  // Leaving the scope must have run the destructor.
  EXPECT_EQ(0, counter);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
  int counter = 0;
  {
    // Each of the seven elements is built with the explicit constructor,
    // which bumps the counter and stores 'b' in data_[0].
    auto arr = MakeUniqueAlignedArray<SampleObject<24>>(7, &counter);
    EXPECT_EQ(7, counter);
    for (size_t i = 0; i < 7; i++) {
      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
    }
  }
  // Leaving the scope must have destroyed every element.
  EXPECT_EQ(0, counter);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocSingleInt) {
  auto ptr = AllocateAligned<uint32_t>(1);
  ASSERT_NE(nullptr, ptr.get());
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);

  // Release the memory right away; this must not crash and must leave the
  // unique_ptr empty.
  ptr.reset(nullptr);
  EXPECT_EQ(nullptr, ptr.get());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocMultipleInt) {
  const size_t kSize = 7777;
  auto ptr = AllocateAligned<uint32_t>(kSize);
  ASSERT_NE(nullptr, ptr.get());
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);

  // ptr[i] goes through the operator[] of the std::unique_ptr type returned
  // by AllocateAligned(); consecutive indices must be adjacent elements.
  EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));

  // Write every element and fold neighbouring elements into `ret` so that
  // the accesses cannot be optimized away.
  size_t ret = 0;
  ptr[0] = static_cast<uint32_t>(0);
  for (size_t i = 1; i < kSize; i++) {
    ptr[i] = static_cast<uint32_t>(i);
    ret += ptr[i] * ptr[i - 1];
  }
  EXPECT_NE(0U, ret);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
  int counter = 0;
  {
    // AllocateAligned only provides raw storage: no constructor runs, so the
    // counter pointer is installed by hand.
    auto obj = AllocateAligned<SampleObject<24>>(1);
    obj[0].counter_ = &counter;
  }
  // The AlignedFreer-based unique_ptr must not have run ~SampleObject<24>(),
  // which would have decremented the counter.
  EXPECT_EQ(0, counter);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) {
  int counter = 0;
  FakeAllocator fake_alloc;
  {
    // Build seven objects with the explicit constructor, taking the memory
    // from the fake allocator.
    auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
        7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
        &counter);
    ASSERT_NE(nullptr, arr.get());
    // The whole array must come from a single allocation.
    EXPECT_EQ(1u, fake_alloc.PendingAllocs());
    EXPECT_EQ(7, counter);
    for (size_t i = 0; i < 7; i++) {
      // The explicit constructor stores 'b' in data_[0].
      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
    }
  }
  // Leaving the scope must have destroyed every element and returned the
  // memory to the fake allocator.
  EXPECT_EQ(0, counter);
  EXPECT_EQ(0u, fake_alloc.PendingAllocs());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, DefaultInit) {
  // What is really being tested is that default-initialized smart pointers
  // compile; that is useful for output parameters and per-thread storage.
  std::vector<AlignedUniquePtr<int[]>> ptrs;
  std::vector<AlignedFreeUniquePtr<double[]>> free_ptrs;
  ptrs.resize(128);
  free_ptrs.resize(128);

  // Fill and read randomly chosen slots so that the pointers cannot be
  // elided. Emscripten lacks random_device, hence the fixed seed.
  std::mt19937 rng(129);
  std::uniform_int_distribution<size_t> dist(0, 127);
  ptrs[dist(rng)] = MakeUniqueAlignedArray<int>(123);
  free_ptrs[dist(rng)] = AllocateAligned<double>(456);

  // "Use" the pointers without resorting to printf: shifting twice by
  // (bits - 1) - a single shift by 64 is not allowed - leaves zero on both
  // sides, so the comparison is 0 == 0.
  const auto addr1 = reinterpret_cast<uintptr_t>(ptrs[dist(rng)].get());
  const auto addr2 = reinterpret_cast<uintptr_t>(free_ptrs[dist(rng)].get());
  constexpr size_t kBits = sizeof(uintptr_t) * 8;
  EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1),
            (addr2 >> (kBits - 1)) >> (kBits - 1));
}
|
||||
|
||||
} // namespace hwy
|
||||
@@ -0,0 +1,946 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_BASE_H_
|
||||
#define HIGHWAY_HWY_BASE_H_
|
||||
|
||||
// For SIMD module implementations and their callers, target-independent.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hwy/detect_compiler_arch.h"
|
||||
#include "hwy/highway_export.h"
|
||||
|
||||
#if HWY_COMPILER_MSVC
|
||||
#include <string.h> // memcpy
|
||||
#endif
|
||||
#if HWY_ARCH_X86
|
||||
#include <atomic>
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Compiler-specific definitions
|
||||
|
||||
// Stringification. HWY_STR expands its argument before turning it into a
// string literal; HWY_STR_IMPL stringifies the argument as written.
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)

#if HWY_COMPILER_MSVC

#include <intrin.h>

// Function and pointer qualifiers.
#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_NORETURN __declspec(noreturn)
#define HWY_FLATTEN
#define HWY_MAYBE_UNUSED
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

// Branch hints are plain pass-throughs here.
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)

// Pragmas and diagnostics; HWY_DIAGNOSTICS_OFF picks its first (msc) argument.
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)

#define HWY_HAS_ASSUME_ALIGNED 0

#else

// Function and pointer qualifiers.
#define HWY_RESTRICT __restrict__
// Forcing inlining while optimization is disabled creates very inefficient
// code that can make the compiler time out, so only do it in optimized builds.
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_NORETURN __attribute__((noreturn))
#define HWY_FLATTEN __attribute__((flatten))
// The C++17 [[maybe_unused]] triggered "attribute list cannot appear here",
// so only the old-style attribute is used for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

// Branch hints.
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)

// Pragmas and diagnostics; HWY_DIAGNOSTICS_OFF picks its second (gcc)
// argument.
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)

#endif  // !HWY_COMPILER_MSVC
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Builtin/attributes
|
||||
|
||||
// HWY_FORMAT(idx_fmt, idx_arg): turns on printf-style format string checking
// where the __format__ attribute is available, and expands to nothing
// elsewhere.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif

// HWY_ASSUME_ALIGNED(ptr, align): yields a void* that the compiler may assume
// is `align`-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// GCC/Clang require the result to be assigned. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif
|
||||
|
||||
// Clang and GCC need a target attribute on every function into which SIMD
// intrinsics are inlined. Besides per-function annotation (HWY_ATTR, e.g. for
// lambdas), these macros apply the attribute automatically via pragmas to all
// functions between HWY_PUSH_ATTRIBUTES and HWY_POP_ATTRIBUTES. Other
// compilers get empty definitions.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Macros
|
||||
|
||||
// Qualifiers applied to every public SIMD function.
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

// Token pasting; HWY_CONCAT expands its arguments before joining them.
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

// Smaller / larger of two values. Being macros, these evaluate the selected
// argument twice.
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// Loop unrolling hints: HWY_UNROLL(factor) requests a specific factor and
// HWY_DEFAULT_UNROLL leaves the factor to the compiler where possible. Both
// expand to nothing on compilers not listed here.
#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor, so
// the default uses a fixed factor of 4.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif
|
||||
|
||||
|
||||
// HWY_FENCE: compile-time fence that prevents undesirable code reordering.
// On Clang x86 the usual asm volatile("" : : : "memory") has no effect,
// whereas an atomic fence does, without generating code.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif

// Repeats a literal four times (comma-separated); useful as the initializer
// passed to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal
|
||||
|
||||
// Passes the current file and line plus a printf-style message to
// ::hwy::Abort.
#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Checked in every build type: calls HWY_ABORT with the stringified condition
// when `condition` is false.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)
|
||||
|
||||
// Sanitizer detection. Each HWY_IS_*SAN macro is 1 if the compiler reports
// the corresponding feature or the matching *_SANITIZER macro is defined, and
// 0 otherwise.

// MemorySanitizer
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif

// AddressSanitizer
#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif

// ThreadSanitizer
#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif

// MSAN may cause lengthy build times or false positives, e.g. in AVX3
// DemoteTo. Adding this attribute to the failing function disables MSAN for
// it; the macro is empty when MSAN is not active.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif
|
||||
|
||||
// HWY_IS_DEBUG_BUILD enables HWY_DASSERT and shortens tests in slower debug
// builds. It may be predefined by the build; otherwise it is derived here.
#ifndef HWY_IS_DEBUG_BUILD
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG). Sanitizer and
// static-analyzer builds are always treated as debug builds.
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

// Debug-only assertion: same as HWY_ASSERT in debug builds; otherwise an empty
// statement that does not evaluate `condition`.
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif
|
||||
|
||||
namespace hwy {
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// kMaxVectorSize (undocumented, pending removal)
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
|
||||
#elif HWY_ARCH_RVV && defined(__riscv_vector)
|
||||
// Not actually an upper bound on the size.
|
||||
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
|
||||
#else
|
||||
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Alignment
|
||||
|
||||
// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
|
||||
// should be allocated dynamically via aligned_allocator.h because Lanes() may
|
||||
// exceed the stack size.
|
||||
#if HWY_ARCH_X86
|
||||
#define HWY_ALIGN_MAX alignas(64)
|
||||
#elif HWY_ARCH_RVV && defined(__riscv_vector)
|
||||
#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
|
||||
#else
|
||||
#define HWY_ALIGN_MAX alignas(16)
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Lane types
|
||||
|
||||
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
|
||||
// by concatenating base type and bits.
|
||||
|
||||
#pragma pack(push, 1)
|
||||
|
||||
// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
|
||||
// always supported on aarch64, for v7 only if -mfp16-format is given.
|
||||
#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
|
||||
using float16_t = __fp16;
|
||||
// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
|
||||
// Required for Clang RVV if the float16 extension is used.
|
||||
#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
|
||||
using float16_t = _Float16;
|
||||
// Otherwise emulate
|
||||
#else
|
||||
struct float16_t {
|
||||
uint16_t bits;
|
||||
};
|
||||
#endif
|
||||
|
||||
struct bfloat16_t {
|
||||
uint16_t bits;
|
||||
};
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
using float32_t = float;
|
||||
using float64_t = double;
|
||||
|
||||
#pragma pack(push, 1)
|
||||
|
||||
// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
|
||||
// https://reviews.llvm.org/D86310
|
||||
struct alignas(16) uint128_t {
|
||||
uint64_t lo; // little-endian layout
|
||||
uint64_t hi;
|
||||
};
|
||||
|
||||
// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
|
||||
// field is to be compared (Lt128Upper instead of Lt128).
|
||||
struct alignas(16) K64V64 {
|
||||
uint64_t value; // little-endian layout
|
||||
uint64_t key;
|
||||
};
|
||||
|
||||
// 32 bit key plus 32 bit value. Allows vqsort recursions to terminate earlier
|
||||
// than when considering both to be a 64-bit key.
|
||||
struct alignas(8) K32V32 {
|
||||
uint32_t value; // little-endian layout
|
||||
uint32_t key;
|
||||
};
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
// Orders two uint128_t values numerically: the upper halves decide unless
// they are equal, in which case the lower halves are compared.
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
                                              const uint128_t& b) {
  if (a.hi != b.hi) {
    return a.hi < b.hi;
  }
  return a.lo < b.lo;
}
// Required for std::greater. Mirror image of operator< (i.e. `b < a`).
static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
                                              const uint128_t& b) {
  if (a.hi != b.hi) {
    return b.hi < a.hi;
  }
  return b.lo < a.lo;
}
// Two values are equal only if both 64-bit halves match.
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
                                               const uint128_t& b) {
  return a.hi == b.hi && a.lo == b.lo;
}
|
||||
|
||||
// Orders K64V64 by `key` only; `value` does not take part in the comparison.
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
                                              const K64V64& b) {
  const bool key_is_less = a.key < b.key;
  return key_is_less;
}
// Required for std::greater. Equivalent to `b < a`, i.e. compares keys only.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
                                              const K64V64& b) {
  const bool key_is_greater = b.key < a.key;
  return key_is_greater;
}
|
||||
|
||||
// Orders K32V32 by `key` only; `value` does not take part in the comparison.
static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
                                              const K32V32& b) {
  const bool key_is_less = a.key < b.key;
  return key_is_less;
}
// Required for std::greater. Equivalent to `b < a`, i.e. compares keys only.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {
  const bool key_is_greater = b.key < a.key;
  return key_is_greater;
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Controlling overload resolution (SFINAE)
|
||||
|
||||
// Minimal enable_if replacement. The primary template deliberately has no
// `type` member, so naming `EnableIfT<false>::type` is a substitution failure.
template <bool Condition>
struct EnableIfT {};

// Only the `true` specialization exposes `type` (always void).
template <>
struct EnableIfT<true> {
  typedef void type;
};

// Shorthand: `void` if Condition holds, otherwise ill-formed (SFINAE).
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
|
||||
|
||||
// Type-equality trait. The primary template handles two distinct types and
// reports value == 0.
template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

// Partial specialization chosen when both arguments are the same type;
// reports value == 1.
template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};
|
||||
|
||||
template <typename T, typename U>
|
||||
HWY_API constexpr bool IsSame() {
|
||||
return IsSameT<T, U>::value;
|
||||
}
|
||||
|
||||
// Insert into template/function arguments to enable this overload only for
|
||||
// vectors of AT MOST this many bits.
|
||||
//
|
||||
// Note that enabling for exactly 128 bits is unnecessary because a function can
|
||||
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
|
||||
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
|
||||
// Size-based SFINAE helpers: each expands to an unnamed, defaulted template
// parameter that is only well-formed if a vector of N lanes of type T has the
// stated total size in bytes. N is parenthesized so that expression arguments
// (e.g. `A + B`) are multiplied as a whole rather than being split by operator
// precedence.
#define HWY_IF_LE128(T, N) hwy::EnableIf<sizeof(T) * (N) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<sizeof(T) * (N) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<sizeof(T) * (N) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<sizeof(T) * (N) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<sizeof(T) * (N) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<sizeof(T) * (N) >= 16>* = nullptr
// The extra parentheses keep `>` from being parsed as the end of the template
// argument list.
#define HWY_IF_GT128(T, N) hwy::EnableIf<(sizeof(T) * (N) > 16)>* = nullptr
|
||||
|
||||
// Category-based SFINAE helpers: each expands to an unnamed, defaulted template
// parameter that is only well-formed if lane type T belongs to the category.
// Every trait is qualified with hwy:: (as HWY_IF_FLOAT already was) so that the
// macros also expand correctly when used outside namespace hwy.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
// Signed integers only: floating-point types are excluded here.
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
|
||||
|
||||
#define HWY_IF_LANE_SIZE(T, bytes) \
|
||||
hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
|
||||
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
|
||||
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
|
||||
#define HWY_IF_LANE_SIZE_LT(T, bytes) \
|
||||
hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
|
||||
|
||||
// Enables an overload only if the number of T lanes within one block (the
// vector size in bytes, capped at 16) equals LANES. N is parenthesized so that
// expression arguments are multiplied as a whole.
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * (N), 16) / sizeof(T) == (LANES)>* = nullptr
|
||||
|
||||
// Empty struct used as a size tag type.
|
||||
template <size_t N>
|
||||
struct SizeTag {};
|
||||
|
||||
// Strips a top-level `const` from T. The primary template passes T through
// unchanged.
template <class T>
struct RemoveConstT {
  typedef T type;
};

// Specialization matching `const T`: exposes the unqualified T.
template <class T>
struct RemoveConstT<const T> {
  typedef T type;
};

// Shorthand for `typename RemoveConstT<T>::type`.
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type relations
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename T>
|
||||
struct Relations;
|
||||
template <>
|
||||
struct Relations<uint8_t> {
|
||||
using Unsigned = uint8_t;
|
||||
using Signed = int8_t;
|
||||
using Wide = uint16_t;
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int8_t> {
|
||||
using Unsigned = uint8_t;
|
||||
using Signed = int8_t;
|
||||
using Wide = int16_t;
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Wide = uint32_t;
|
||||
using Narrow = uint8_t;
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Wide = int32_t;
|
||||
using Narrow = int8_t;
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint32_t> {
|
||||
using Unsigned = uint32_t;
|
||||
using Signed = int32_t;
|
||||
using Float = float;
|
||||
using Wide = uint64_t;
|
||||
using Narrow = uint16_t;
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int32_t> {
|
||||
using Unsigned = uint32_t;
|
||||
using Signed = int32_t;
|
||||
using Float = float;
|
||||
using Wide = int64_t;
|
||||
using Narrow = int16_t;
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint64_t> {
|
||||
using Unsigned = uint64_t;
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Wide = uint128_t;
|
||||
using Narrow = uint32_t;
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<int64_t> {
|
||||
using Unsigned = uint64_t;
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Narrow = int32_t;
|
||||
enum { is_signed = 1, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint128_t> {
|
||||
using Unsigned = uint128_t;
|
||||
using Narrow = uint64_t;
|
||||
enum { is_signed = 0, is_float = 0 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<float16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Float = float16_t;
|
||||
using Wide = float;
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<bfloat16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Wide = float;
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<float> {
|
||||
using Unsigned = uint32_t;
|
||||
using Signed = int32_t;
|
||||
using Float = float;
|
||||
using Wide = double;
|
||||
using Narrow = float16_t;
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
template <>
|
||||
struct Relations<double> {
|
||||
using Unsigned = uint64_t;
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Narrow = float;
|
||||
enum { is_signed = 1, is_float = 1 };
|
||||
};
|
||||
|
||||
template <size_t N>
|
||||
struct TypeFromSize;
|
||||
template <>
|
||||
struct TypeFromSize<1> {
|
||||
using Unsigned = uint8_t;
|
||||
using Signed = int8_t;
|
||||
};
|
||||
template <>
|
||||
struct TypeFromSize<2> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
};
|
||||
template <>
|
||||
struct TypeFromSize<4> {
|
||||
using Unsigned = uint32_t;
|
||||
using Signed = int32_t;
|
||||
using Float = float;
|
||||
};
|
||||
template <>
|
||||
struct TypeFromSize<8> {
|
||||
using Unsigned = uint64_t;
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
};
|
||||
template <>
|
||||
struct TypeFromSize<16> {
|
||||
using Unsigned = uint128_t;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Aliases for types of a different category, but the same size.
|
||||
template <typename T>
|
||||
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
|
||||
template <typename T>
|
||||
using MakeSigned = typename detail::Relations<T>::Signed;
|
||||
template <typename T>
|
||||
using MakeFloat = typename detail::Relations<T>::Float;
|
||||
|
||||
// Aliases for types of the same category, but different size.
|
||||
template <typename T>
|
||||
using MakeWide = typename detail::Relations<T>::Wide;
|
||||
template <typename T>
|
||||
using MakeNarrow = typename detail::Relations<T>::Narrow;
|
||||
|
||||
// Obtain type from its size [bytes].
|
||||
template <size_t N>
|
||||
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
|
||||
template <size_t N>
|
||||
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
|
||||
template <size_t N>
|
||||
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
|
||||
|
||||
// Avoid confusion with SizeTag where the parameter is a lane size.
|
||||
using UnsignedTag = SizeTag<0>;
|
||||
using SignedTag = SizeTag<0x100>; // integer
|
||||
using FloatTag = SizeTag<0x200>;
|
||||
|
||||
template <typename T, class R = detail::Relations<T>>
|
||||
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
|
||||
return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
|
||||
}
|
||||
|
||||
// For when we only want to distinguish FloatTag from everything else.
|
||||
using NonFloatTag = SizeTag<0x400>;
|
||||
|
||||
template <typename T, class R = detail::Relations<T>>
|
||||
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
|
||||
return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type traits
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr bool IsFloat() {
|
||||
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
|
||||
// from a float, not compared.
|
||||
return IsSame<T, float>() || IsSame<T, double>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr bool IsSigned() {
|
||||
return T(0) > T(-1);
|
||||
}
|
||||
template <>
|
||||
constexpr bool IsSigned<float16_t>() {
|
||||
return true;
|
||||
}
|
||||
template <>
|
||||
constexpr bool IsSigned<bfloat16_t>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Largest/smallest representable integer values.
|
||||
template <typename T>
|
||||
HWY_API constexpr T LimitsMax() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
using TU = MakeUnsigned<T>;
|
||||
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
|
||||
: static_cast<TU>(~0ull));
|
||||
}
|
||||
template <typename T>
|
||||
HWY_API constexpr T LimitsMin() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
|
||||
}
|
||||
|
||||
// Largest/smallest representable value (integer or float). This naming avoids
|
||||
// confusion with numeric_limits<float>::min() (the smallest positive value).
|
||||
template <typename T>
|
||||
HWY_API constexpr T LowestValue() {
|
||||
return LimitsMin<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float LowestValue<float>() {
|
||||
return -3.402823466e+38F;
|
||||
}
|
||||
template <>
|
||||
constexpr double LowestValue<double>() {
|
||||
return -1.7976931348623158e+308;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr T HighestValue() {
|
||||
return LimitsMax<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float HighestValue<float>() {
|
||||
return 3.402823466e+38F;
|
||||
}
|
||||
template <>
|
||||
constexpr double HighestValue<double>() {
|
||||
return 1.7976931348623158e+308;
|
||||
}
|
||||
|
||||
// Difference between 1.0 and the next representable value.
|
||||
template <typename T>
|
||||
HWY_API constexpr T Epsilon() {
|
||||
return 1;
|
||||
}
|
||||
template <>
|
||||
constexpr float Epsilon<float>() {
|
||||
return 1.192092896e-7f;
|
||||
}
|
||||
template <>
|
||||
constexpr double Epsilon<double>() {
|
||||
return 2.2204460492503131e-16;
|
||||
}
|
||||
|
||||
// Returns width in bits of the mantissa field in IEEE binary32/64.
|
||||
// Primary template: instantiating it for any type other than the
// specializations below triggers the static_assert.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}

// binary32: 23 explicitly stored mantissa bits.
template <>
constexpr int MantissaBits<float>() {
  return 23;
}

// binary64: 52 explicitly stored mantissa bits.
template <>
constexpr int MantissaBits<double>() {
  return 52;
}
|
||||
|
||||
// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
|
||||
// the largest possible (biased) exponent field. Used by IsInf.
|
||||
template <typename T>
|
||||
constexpr MakeSigned<T> MaxExponentTimes2() {
|
||||
return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
|
||||
}
|
||||
|
||||
// Returns bitmask of the sign bit in IEEE binary32/64.
|
||||
template <typename T>
|
||||
constexpr MakeUnsigned<T> SignMask() {
|
||||
return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
|
||||
}
|
||||
|
||||
// Returns bitmask of the exponent field in IEEE binary32/64.
|
||||
template <typename T>
|
||||
constexpr MakeUnsigned<T> ExponentMask() {
|
||||
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
|
||||
}
|
||||
|
||||
// Returns bitmask of the mantissa field in IEEE binary32/64.
|
||||
template <typename T>
|
||||
constexpr MakeUnsigned<T> MantissaMask() {
|
||||
return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
|
||||
}
|
||||
|
||||
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
|
||||
// absolute value is less than this can be represented exactly.
|
||||
// Primary template: instantiating it for any type other than the
// specializations below triggers the static_assert.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}

// 2^23 as a float (8388608.0f); the power of two converts exactly.
template <>
constexpr float MantissaEnd<float>() {
  return static_cast<float>(uint32_t{1} << 23);
}

// 2^52 as a double (4503599627370496.0); the power of two converts exactly.
// A hexadecimal floating-point literal with p52 would require C++17.
template <>
constexpr double MantissaEnd<double>() {
  return static_cast<double>(uint64_t{1} << 52);
}
|
||||
|
||||
// Returns width in bits of the exponent field in IEEE binary32/64.
|
||||
template <typename T>
|
||||
constexpr int ExponentBits() {
|
||||
// Exponent := remaining bits after deducting sign and mantissa.
|
||||
return 8 * sizeof(T) - 1 - MantissaBits<T>();
|
||||
}
|
||||
|
||||
// Returns largest value of the biased exponent field in IEEE binary32/64,
|
||||
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
|
||||
// This is expressed as a signed integer for more efficient comparison.
|
||||
template <typename T>
|
||||
constexpr MakeSigned<T> MaxExponentField() {
|
||||
return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Helper functions
|
||||
|
||||
// Returns a / b rounded up, computed as (a + b - 1) / b and converted to T1.
// The intermediate sum a + b - 1 is not checked for overflow.
template <typename T1, typename T2>
constexpr inline auto DivCeil(T1 a, T2 b) -> T1 {
  return (a + b - 1) / b;
}
|
||||
|
||||
// Works for any `align`; if a power of two, compiler emits ADD+AND.
|
||||
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  // Ceiling division by `align`, then scale back up: yields the smallest
  // multiple of `align` that is >= `what`. `align` must be nonzero because it
  // is used as a divisor.
  return ((what + align - 1) / align) * align;
}
|
||||
|
||||
// Undefined results for x == 0.
|
||||
// Returns the number of zero bits below the least-significant set bit of x
// (count of trailing zeros). x must be nonzero.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#else   // HWY_COMPILER_MSVC
  unsigned long bit_pos;  // NOLINT
  _BitScanForward(&bit_pos, x);
  return bit_pos;
#endif  // !HWY_COMPILER_MSVC
}
|
||||
|
||||
// 64-bit variant: number of zero bits below the least-significant set bit of x.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#elif HWY_ARCH_X86_64
  unsigned long bit_pos;  // NOLINT
  _BitScanForward64(&bit_pos, x);
  return bit_pos;
#else   // MSVC, but not x86-64
  // _BitScanForward64 not available: scan the lower half first and only fall
  // back to the upper half (offset by 32) if the lower half is zero.
  unsigned long bit_pos;  // NOLINT
  const uint32_t lower = static_cast<uint32_t>(x & 0xFFFFFFFF);
  if (lower != 0) {
    _BitScanForward(&bit_pos, lower);
    return bit_pos;
  }
  const uint32_t upper = static_cast<uint32_t>(x >> 32u);
  _BitScanForward(&bit_pos, upper);
  return 32 + bit_pos;
#endif  // !HWY_COMPILER_MSVC
}
|
||||
|
||||
// Undefined results for x == 0.
|
||||
// Returns the number of zero bits above the most-significant set bit of x
// (count of leading zeros). x must be nonzero.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#else   // HWY_COMPILER_MSVC
  unsigned long bit_pos;  // NOLINT
  _BitScanReverse(&bit_pos, x);
  // bit_pos is the index of the highest set bit; convert to a count from the
  // top of the 32-bit word.
  return 31 - bit_pos;
#endif  // !HWY_COMPILER_MSVC
}
|
||||
|
||||
// 64-bit variant: number of zero bits above the most-significant set bit of x.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#elif HWY_ARCH_X86_64
  unsigned long bit_pos;  // NOLINT
  _BitScanReverse64(&bit_pos, x);
  return 63 - bit_pos;
#else   // MSVC, but not x86-64
  // _BitScanReverse64 not available: scan the upper half first and only fall
  // back to the lower half if the upper half is zero.
  unsigned long bit_pos;  // NOLINT
  const uint32_t upper = static_cast<uint32_t>(x >> 32u);
  if (upper != 0) {
    _BitScanReverse(&bit_pos, upper);
    return 31 - bit_pos;
  }
  const uint32_t lower = static_cast<uint32_t>(x & 0xFFFFFFFF);
  _BitScanReverse(&bit_pos, lower);
  // All 32 upper bits are zero, plus (31 - bit_pos) within the lower half.
  return 63 - bit_pos;
#endif  // !HWY_COMPILER_MSVC
}
|
||||
|
||||
// Returns the number of set bits in x.
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_GCC  // includes clang
  return static_cast<size_t>(__builtin_popcountll(x));
  // This instruction has a separate feature flag, but is often called from
  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
  // for AVX, so check for that.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  // No 64-bit instruction here: count each 32-bit half and add the results.
  const uint32_t lower = static_cast<uint32_t>(x & 0xFFFFFFFFu);
  const uint32_t upper = static_cast<uint32_t>(x >> 32);
  return _mm_popcnt_u32(lower) + _mm_popcnt_u32(upper);
#else
  // Portable fallback: sum adjacent bit groups in parallel within the word.
  constexpr uint64_t kEveryOtherBit = 0x5555555555555555ULL;
  constexpr uint64_t kBitPairs = 0x3333333333333333ULL;
  constexpr uint64_t kNibbles = 0x0F0F0F0F0F0F0F0FULL;
  uint64_t sums = x - ((x >> 1) & kEveryOtherBit);           // 2-bit counts
  sums = ((sums >> 2) & kBitPairs) + (sums & kBitPairs);     // 4-bit counts
  sums = ((sums >> 4) + sums) & kNibbles;                    // 8-bit counts
  // Accumulate the per-byte counts into the lowest byte.
  sums += (sums >> 8);
  sums += (sums >> 16);
  sums += (sums >> 32);
  // The total is at most 64, so the low 7 bits hold the result.
  return static_cast<size_t>(sums & 0x7Fu);
#endif
}
|
||||
|
||||
// Skip HWY_API due to GCC "function not considered for inlining". Previously
|
||||
// such errors were caused by underlying type mismatches, but it's not clear
|
||||
// what is still mismatched despite all the casts.
|
||||
// Returns floor(log2(x)) by halving x until it reaches 1. The recursion only
// terminates once x equals 1, so x must be >= 1 (zero never reaches 1).
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return (x != TI{1}) ? size_t{1} + FloorLog2(static_cast<TI>(x >> 1))
                      : size_t{0};
}
|
||||
|
||||
// Returns ceil(log2(x)): zero for x == 1, otherwise FloorLog2(x - 1) + 1.
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return (x != TI{1}) ? size_t{1} + FloorLog2(static_cast<TI>(x - 1))
                      : size_t{0};
}
|
||||
|
||||
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
|
||||
#pragma intrinsic(_umul128)
|
||||
#endif
|
||||
|
||||
// 64 x 64 = 128 bit multiplication
|
||||
// Multiplies a by b as a full 128-bit product: returns the lower 64 bits and
// stores the upper 64 bits in *upper.
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  const __uint128_t wide =
      static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
  *upper = static_cast<uint64_t>(wide >> 64);
  return static_cast<uint64_t>(wide & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  // Schoolbook multiplication on 32-bit halves.
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t a_lo = a & kLo32;
  const uint64_t a_hi = a >> 32;
  const uint64_t b_lo = b & kLo32;
  const uint64_t b_hi = b >> 32;

  const uint64_t lo_lo = a_lo * b_lo;
  const uint64_t hi_lo = a_hi * b_lo;
  const uint64_t lo_hi = a_lo * b_hi;
  const uint64_t hi_hi = a_hi * b_hi;

  // Middle column: carry out of lo_lo plus the low half of hi_lo plus lo_hi.
  const uint64_t middle = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (middle >> 32) + hi_hi;
  return (middle << 32) | (lo_lo & kLo32);
#endif
}
|
||||
|
||||
#if HWY_COMPILER_MSVC
|
||||
#pragma intrinsic(memcpy)
|
||||
#pragma intrinsic(memset)
|
||||
#endif
|
||||
|
||||
// The source/destination must not overlap/alias.
|
||||
template <size_t kBytes, typename From, typename To>
|
||||
HWY_API void CopyBytes(const From* from, To* to) {
|
||||
#if HWY_COMPILER_MSVC
|
||||
memcpy(to, from, kBytes);
|
||||
#else
|
||||
__builtin_memcpy(
|
||||
static_cast<void*>(to), static_cast<const void*>(from), kBytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Same as CopyBytes, but for same-sized objects; avoids a size argument.
|
||||
template <typename From, typename To>
|
||||
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
|
||||
static_assert(sizeof(From) == sizeof(To), "");
|
||||
CopyBytes<sizeof(From)>(from, to);
|
||||
}
|
||||
|
||||
template <size_t kBytes, typename To>
|
||||
HWY_API void ZeroBytes(To* to) {
|
||||
#if HWY_COMPILER_MSVC
|
||||
memset(to, 0, kBytes);
|
||||
#else
|
||||
__builtin_memset(to, 0, kBytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Converts bfloat16 to float by placing its 16 bits in the upper half of a
// 32-bit pattern (lower half zero) and reinterpreting that pattern as a float.
HWY_API float F32FromBF16(bfloat16_t bf) {
  const uint32_t widened = static_cast<uint32_t>(bf.bits) << 16;
  float result;
  CopySameSize(&widened, &result);
  return result;
}
|
||||
|
||||
// Converts float to bfloat16 by keeping only the upper 16 bits of its 32-bit
// representation; the lower 16 bits are discarded (truncation, no rounding).
HWY_API bfloat16_t BF16FromF32(float f) {
  uint32_t raw;
  CopySameSize(&f, &raw);
  bfloat16_t result;
  result.bits = static_cast<uint16_t>(raw >> 16);
  return result;
}
|
||||
|
||||
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
|
||||
Abort(const char* file, int line, const char* format, ...);
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_BASE_H_
|
||||
@@ -0,0 +1,178 @@
|
||||
// Copyright 2019 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "base_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
HWY_NOINLINE void TestAllLimits() {
|
||||
HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
|
||||
HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
|
||||
HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
|
||||
LimitsMin<int64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
|
||||
HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
|
||||
HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
|
||||
}
|
||||
|
||||
struct TestLowestHighest {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
HWY_ASSERT_EQ(std::numeric_limits<T>::lowest(), LowestValue<T>());
|
||||
HWY_ASSERT_EQ(std::numeric_limits<T>::max(), HighestValue<T>());
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); }
|
||||
struct TestIsUnsigned {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
static_assert(!IsFloat<T>(), "Expected !IsFloat");
|
||||
static_assert(!IsSigned<T>(), "Expected !IsSigned");
|
||||
}
|
||||
};
|
||||
|
||||
struct TestIsSigned {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
static_assert(!IsFloat<T>(), "Expected !IsFloat");
|
||||
static_assert(IsSigned<T>(), "Expected IsSigned");
|
||||
}
|
||||
};
|
||||
|
||||
struct TestIsFloat {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
static_assert(IsFloat<T>(), "Expected IsFloat");
|
||||
static_assert(IsSigned<T>(), "Floats are also considered signed");
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllType() {
|
||||
ForUnsignedTypes(TestIsUnsigned());
|
||||
ForSignedTypes(TestIsSigned());
|
||||
ForFloatTypes(TestIsFloat());
|
||||
|
||||
static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
|
||||
static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
|
||||
static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
|
||||
}
|
||||
|
||||
struct TestIsSame {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
static_assert(IsSame<T, T>(), "T == T");
|
||||
static_assert(!IsSame<MakeSigned<T>, MakeUnsigned<T>>(), "S != U");
|
||||
static_assert(!IsSame<MakeUnsigned<T>, MakeSigned<T>>(), "U != S");
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
|
||||
|
||||
HWY_NOINLINE void TestAllBitScan() {
|
||||
HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
|
||||
HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
|
||||
HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
|
||||
HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
|
||||
HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
|
||||
HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
|
||||
HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));
|
||||
|
||||
HWY_ASSERT_EQ(size_t{0},
|
||||
Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
|
||||
HWY_ASSERT_EQ(size_t{0},
|
||||
Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t{1},
|
||||
Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
|
||||
HWY_ASSERT_EQ(size_t{1},
|
||||
Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
|
||||
HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
|
||||
HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
|
||||
HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));
|
||||
|
||||
HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
|
||||
HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
|
||||
HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
|
||||
HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
|
||||
|
||||
HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
|
||||
HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
|
||||
HWY_ASSERT_EQ(size_t{62},
|
||||
Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
|
||||
HWY_ASSERT_EQ(size_t{63},
|
||||
Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
|
||||
}
|
||||
|
||||
HWY_NOINLINE void TestAllPopCount() {
|
||||
HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
|
||||
HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
|
||||
HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
|
||||
HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
|
||||
HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
|
||||
HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
|
||||
HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));
|
||||
|
||||
HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
|
||||
HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(BaseTest);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
|
||||
} // namespace hwy
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,110 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
|
||||
#define HIGHWAY_HWY_CACHE_CONTROL_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
|
||||
// https://github.com/gperftools/gperftools/issues/946).
|
||||
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
|
||||
#undef HWY_DISABLE_CACHE_CONTROL
|
||||
#define HWY_DISABLE_CACHE_CONTROL
|
||||
#endif
|
||||
|
||||
// intrin.h is sufficient on MSVC and already included by base.h.
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
|
||||
#include <emmintrin.h> // SSE2
|
||||
#endif
|
||||
|
||||
// Windows.h #defines these, which causes infinite recursion. Temporarily
|
||||
// undefine them in this header; these functions are anyway deprecated.
|
||||
// TODO(janwas): remove when these functions are removed.
|
||||
#pragma push_macro("LoadFence")
|
||||
#undef LoadFence
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
|
||||
#define HWY_STREAM_MULTIPLE 16
|
||||
|
||||
// The following functions may also require an attribute.
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
|
||||
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
|
||||
#else
|
||||
#define HWY_ATTR_CACHE
|
||||
#endif
|
||||
|
||||
// Delays subsequent loads until prior loads are visible. Beware of potentially
|
||||
// differing behavior across architectures and vendors: on Intel but not
|
||||
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
|
||||
// complete).
|
||||
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_lfence();
|
||||
#endif
|
||||
}
|
||||
|
||||
// Ensures values written by previous `Stream` calls are visible on the current
|
||||
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
|
||||
// outputs are to be consumed by other core(s), the producer must publish
|
||||
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
|
||||
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_sfence();
|
||||
#endif
|
||||
}
|
||||
|
||||
// Optionally begins loading the cache line containing "p" to reduce latency of
|
||||
// subsequent actual loads.
|
||||
template <typename T>
|
||||
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
|
||||
#elif HWY_COMPILER_GCC // includes clang
|
||||
// Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
|
||||
// desirable, so use the default 3 (keep in caches).
|
||||
__builtin_prefetch(p, /*write=*/0, /*hint=*/3);
|
||||
#else
|
||||
(void)p;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Invalidates and flushes the cache line containing "p", if possible.
|
||||
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_clflush(p);
|
||||
#else
|
||||
(void)p;
|
||||
#endif
|
||||
}
|
||||
|
||||
// When called inside a spin-loop, may reduce power consumption.
|
||||
HWY_INLINE HWY_ATTR_CACHE void Pause() {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_pause();
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
// TODO(janwas): remove when these functions are removed. (See above.)
|
||||
#pragma pop_macro("LoadFence")
|
||||
|
||||
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_
|
||||
@@ -0,0 +1,136 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target include guard
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
|
||||
#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// These functions avoid having to write a loop plus remainder handling in the
|
||||
// (unfortunately still common) case where arrays are not aligned/padded. If the
|
||||
// inputs are known to be aligned/padded, it is more efficient to write a single
|
||||
// loop using Load(). We do not provide a CopyAlignedPadded because it
|
||||
// would be more verbose than such a loop.
|
||||
|
||||
// Fills `to`[0, `count`) with `value`.
|
||||
template <class D, typename T = TFromD<D>>
|
||||
void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
|
||||
const size_t N = Lanes(d);
|
||||
const Vec<D> v = Set(d, value);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
StoreU(v, d, to + idx);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
SafeFillN(remaining, value, d, to + idx);
|
||||
}
|
||||
|
||||
// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
|
||||
template <class D, typename T = TFromD<D>>
|
||||
void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
const Vec<D> v = LoadU(d, from + idx);
|
||||
StoreU(v, d, to + idx);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
SafeCopyN(remaining, d, from + idx, to + idx);
|
||||
}
|
||||
|
||||
// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
|
||||
// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
|
||||
// of the newly written elements in `to`.
|
||||
//
|
||||
// `func` is either a functor with a templated operator()(d, v) returning a
|
||||
// mask, or a generic lambda if using C++14. Due to apparent limitations of
|
||||
// Clang on Windows, it is currently necessary to add HWY_ATTR before the
|
||||
// opening { of the lambda to avoid errors about "function .. requires target".
|
||||
//
|
||||
// NOTE: this is only supported for 16-, 32- or 64-bit types.
|
||||
// NOTE: Func may be called a second time for elements it has already seen, but
|
||||
// these elements will not be written to `to` again.
|
||||
template <class D, class Func, typename T = TFromD<D>>
|
||||
T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
|
||||
const Func& func) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
const Vec<D> v = LoadU(d, from + idx);
|
||||
to += CompressBlendedStore(v, func(d, v), d, to);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return to;
|
||||
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Proceed one by one.
|
||||
const CappedTag<T, 1> d1;
|
||||
for (; idx < count; ++idx) {
|
||||
using V1 = Vec<decltype(d1)>;
|
||||
// Workaround for -Waggressive-loop-optimizations on GCC 8
|
||||
// (iteration 2305843009213693951 invokes undefined behavior for T=i64)
|
||||
const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
|
||||
const T* HWY_RESTRICT from_idx =
|
||||
reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
|
||||
const V1 v = LoadU(d1, from_idx);
|
||||
// Avoid storing to `to` unless we know it should be kept - otherwise, we
|
||||
// might overrun the end if it was allocated for the exact count.
|
||||
if (CountTrue(d1, func(d1, v)) == 0) continue;
|
||||
StoreU(v, d1, to);
|
||||
to += 1;
|
||||
}
|
||||
#else
|
||||
// Start index of the last unaligned whole vector, ending at the array end.
|
||||
const size_t last = count - N;
|
||||
// Number of elements before `from` or already written.
|
||||
const size_t invalid = idx - last;
|
||||
HWY_DASSERT(0 != invalid && invalid < N);
|
||||
const Mask<D> mask = Not(FirstN(d, invalid));
|
||||
const Vec<D> v = MaskedLoad(mask, d, from + last);
|
||||
to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
|
||||
#endif
|
||||
return to;
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
|
||||
@@ -0,0 +1,199 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
#include "hwy/contrib/algo/copy-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
// If your project requires C++14 or later, you can ignore this and pass lambdas
|
||||
// directly to CopyIf, without requiring an lvalue as we do here for C++11.
|
||||
#if __cplusplus < 201402L
|
||||
#define HWY_GENERIC_LAMBDA 0
|
||||
#else
|
||||
#define HWY_GENERIC_LAMBDA 1
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Returns random integer in [0, 128), which fits in any lane type.
|
||||
template <typename T>
|
||||
T Random7Bit(RandomState& rng) {
|
||||
return static_cast<T>(Random32(&rng) & 127);
|
||||
}
|
||||
|
||||
// In C++14, we can instead define these as generic lambdas next to where they
|
||||
// are invoked.
|
||||
#if !HWY_GENERIC_LAMBDA
|
||||
|
||||
struct IsOdd {
|
||||
template <class D, class V>
|
||||
Mask<D> operator()(D d, V v) const {
|
||||
return TestBit(v, Set(d, TFromD<D>{1}));
|
||||
}
|
||||
};
|
||||
|
||||
#endif // !HWY_GENERIC_LAMBDA
|
||||
|
||||
// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from
|
||||
// ForFloatTypes.
|
||||
template <class Test>
|
||||
struct ForeachCountAndMisalign {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) const {
|
||||
RandomState rng;
|
||||
const size_t N = Lanes(d);
|
||||
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
|
||||
|
||||
for (size_t count = 0; count < 2 * N; ++count) {
|
||||
for (size_t ma : misalignments) {
|
||||
for (size_t mb : misalignments) {
|
||||
Test()(d, count, ma, mb, rng);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct TestFill {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
// HWY_MAX prevents error when misalign == count == 0.
|
||||
AlignedFreeUniquePtr<T[]> pa =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
T* expected = pa.get() + misalign_a;
|
||||
const T value = Random7Bit<T>(rng);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
expected[i] = value;
|
||||
}
|
||||
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + count + 1);
|
||||
T* actual = pb.get() + misalign_b;
|
||||
|
||||
actual[count] = T{0}; // sentinel
|
||||
Fill(d, value, count, actual);
|
||||
HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
const char* target_name = hwy::TargetName(HWY_TARGET);
|
||||
hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllFill() {
|
||||
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>());
|
||||
}
|
||||
|
||||
struct TestCopy {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
// Prevents error if size to allocate is zero.
|
||||
AlignedFreeUniquePtr<T[]> pa =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
T* a = pa.get() + misalign_a;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
a[i] = Random7Bit<T>(rng);
|
||||
}
|
||||
AlignedFreeUniquePtr<T[]> pb =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
|
||||
T* b = pb.get() + misalign_b;
|
||||
|
||||
Copy(d, a, count, b);
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
const char* target_name = hwy::TargetName(HWY_TARGET);
|
||||
hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__,
|
||||
__LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllCopy() {
|
||||
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>());
|
||||
}
|
||||
|
||||
struct TestCopyIf {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
// Prevents error if size to allocate is zero.
|
||||
AlignedFreeUniquePtr<T[]> pa =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
T* a = pa.get() + misalign_a;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
a[i] = Random7Bit<T>(rng);
|
||||
}
|
||||
const size_t padding = Lanes(ScalableTag<T>());
|
||||
AlignedFreeUniquePtr<T[]> pb =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding));
|
||||
T* b = pb.get() + misalign_b;
|
||||
|
||||
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
|
||||
size_t num_odd = 0;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
if (a[i] & 1) {
|
||||
expected[num_odd++] = a[i];
|
||||
}
|
||||
}
|
||||
|
||||
#if HWY_GENERIC_LAMBDA
|
||||
const auto is_odd = [](const auto d, const auto v) HWY_ATTR {
|
||||
return TestBit(v, Set(d, TFromD<decltype(d)>{1}));
|
||||
};
|
||||
#else
|
||||
const IsOdd is_odd;
|
||||
#endif
|
||||
T* end = CopyIf(d, a, count, b, is_odd);
|
||||
const size_t num_written = static_cast<size_t>(end - b);
|
||||
HWY_ASSERT_EQ(num_odd, num_written);
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
const char* target_name = hwy::TargetName(HWY_TARGET);
|
||||
hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllCopyIf() {
|
||||
ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(CopyTest);
|
||||
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill);
|
||||
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy);
|
||||
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf);
|
||||
} // namespace hwy
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,109 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target include guard
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
|
||||
#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Returns index of the first element equal to `value` in `in[0, count)`, or
|
||||
// `count` if not found.
|
||||
template <class D, typename T = TFromD<D>>
|
||||
size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) {
|
||||
const size_t N = Lanes(d);
|
||||
const Vec<D> broadcasted = Set(d, value);
|
||||
|
||||
size_t i = 0;
|
||||
for (; i + N <= count; i += N) {
|
||||
const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i)));
|
||||
if (pos >= 0) return i + static_cast<size_t>(pos);
|
||||
}
|
||||
|
||||
if (i != count) {
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Scan single elements.
|
||||
const CappedTag<T, 1> d1;
|
||||
using V1 = Vec<decltype(d1)>;
|
||||
const V1 broadcasted1 = Set(d1, GetLane(broadcasted));
|
||||
for (; i < count; ++i) {
|
||||
if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - i;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
const Vec<D> v = MaskedLoad(mask, d, in + i);
|
||||
// Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
|
||||
const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask));
|
||||
if (pos >= 0) return i + static_cast<size_t>(pos);
|
||||
#endif // HWY_MEM_OPS_MIGHT_FAULT
|
||||
}
|
||||
|
||||
return count; // not found
|
||||
}
|
||||
|
||||
// Returns index of the first element in `in[0, count)` for which `func(d, vec)`
|
||||
// returns true, otherwise `count`.
|
||||
template <class D, class Func, typename T = TFromD<D>>
|
||||
size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
size_t i = 0;
|
||||
for (; i + N <= count; i += N) {
|
||||
const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i)));
|
||||
if (pos >= 0) return i + static_cast<size_t>(pos);
|
||||
}
|
||||
|
||||
if (i != count) {
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Scan single elements.
|
||||
const CappedTag<T, 1> d1;
|
||||
for (; i < count; ++i) {
|
||||
if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - i;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
const Vec<D> v = MaskedLoad(mask, d, in + i);
|
||||
// Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
|
||||
const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask));
|
||||
if (pos >= 0) return i + static_cast<size_t>(pos);
|
||||
#endif // HWY_MEM_OPS_MIGHT_FAULT
|
||||
}
|
||||
|
||||
return count; // not found
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
|
||||
@@ -0,0 +1,219 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <algorithm>
|
||||
#include <vector>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/print.h"
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
#include "hwy/contrib/algo/find-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
// If your project requires C++14 or later, you can ignore this and pass lambdas
|
||||
// directly to FindIf, without requiring an lvalue as we do here for C++11.
|
||||
#if __cplusplus < 201402L
|
||||
#define HWY_GENERIC_LAMBDA 0
|
||||
#else
|
||||
#define HWY_GENERIC_LAMBDA 1
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Returns random number in [-8, 8) - we use knowledge of the range to Find()
|
||||
// values we know are not present.
|
||||
template <typename T>
|
||||
T Random(RandomState& rng) {
|
||||
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
|
||||
const double val = (bits - 512) / 64.0;
|
||||
// Clamp negative to zero for unsigned types.
|
||||
return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
|
||||
}
|
||||
|
||||
// In C++14, we can instead define these as generic lambdas next to where they
|
||||
// are invoked.
|
||||
#if !HWY_GENERIC_LAMBDA
|
||||
|
||||
class GreaterThan {
|
||||
public:
|
||||
GreaterThan(int val) : val_(val) {}
|
||||
template <class D, class V>
|
||||
Mask<D> operator()(D d, V v) const {
|
||||
return Gt(v, Set(d, static_cast<TFromD<D>>(val_)));
|
||||
}
|
||||
|
||||
private:
|
||||
int val_;
|
||||
};
|
||||
|
||||
#endif // !HWY_GENERIC_LAMBDA
|
||||
|
||||
// Invokes Test (e.g. TestFind) with all arg combinations.
|
||||
template <class Test>
|
||||
struct ForeachCountAndMisalign {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) const {
|
||||
RandomState rng;
|
||||
const size_t N = Lanes(d);
|
||||
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
|
||||
|
||||
// Find() checks 8 vectors at a time, so we want to cover a fairly large
|
||||
// range without oversampling (checking every possible count).
|
||||
std::vector<size_t> counts(AdjustedReps(512));
|
||||
for (size_t& count : counts) {
|
||||
count = static_cast<size_t>(rng()) % (16 * N + 1);
|
||||
}
|
||||
counts[0] = 0; // ensure we test count=0.
|
||||
|
||||
for (size_t count : counts) {
|
||||
for (size_t m : misalignments) {
|
||||
Test()(d, count, m, rng);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct TestFind {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
// Must allocate at least one even if count is zero.
|
||||
AlignedFreeUniquePtr<T[]> storage =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign + count));
|
||||
T* in = storage.get() + misalign;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
in[i] = Random<T>(rng);
|
||||
}
|
||||
|
||||
// For each position, search for that element (which we know is there)
|
||||
for (size_t pos = 0; pos < count; ++pos) {
|
||||
const size_t actual = Find(d, in[pos], in, count);
|
||||
|
||||
// We may have found an earlier occurrence of the same value; ensure the
|
||||
// value is the same, and that it is the first.
|
||||
if (!IsEqual(in[pos], in[actual])) {
|
||||
fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n",
|
||||
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
|
||||
static_cast<double>(in[actual]), static_cast<int>(actual),
|
||||
static_cast<double>(in[pos]));
|
||||
HWY_ASSERT(false);
|
||||
}
|
||||
for (size_t i = 0; i < actual; ++i) {
|
||||
if (IsEqual(in[i], in[pos])) {
|
||||
fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n",
|
||||
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
|
||||
static_cast<double>(in[i]), static_cast<int>(i),
|
||||
static_cast<int>(actual));
|
||||
HWY_ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also search for values we know not to be present (out of range)
|
||||
HWY_ASSERT_EQ(count, Find(d, T{9}, in, count));
|
||||
HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count));
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllFind() {
|
||||
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>());
|
||||
}
|
||||
|
||||
struct TestFindIf {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
using TI = MakeSigned<T>;
|
||||
// Must allocate at least one even if count is zero.
|
||||
AlignedFreeUniquePtr<T[]> storage =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign + count));
|
||||
T* in = storage.get() + misalign;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
in[i] = Random<T>(rng);
|
||||
HWY_ASSERT(in[i] < 8);
|
||||
HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8);
|
||||
}
|
||||
|
||||
bool found_any = false;
|
||||
bool not_found_any = false;
|
||||
|
||||
// unsigned T would be promoted to signed and compare greater than any
|
||||
// negative val, whereas Set() would just cast to an unsigned value and the
|
||||
// comparison remains unsigned, so avoid negative numbers there.
|
||||
const int min_val = IsSigned<T>() ? -9 : 0;
|
||||
// Includes out-of-range value 9 to test the not-found path.
|
||||
for (int val = min_val; val <= 9; ++val) {
|
||||
#if HWY_GENERIC_LAMBDA
|
||||
const auto greater = [val](const auto d, const auto v) HWY_ATTR {
|
||||
return Gt(v, Set(d, static_cast<T>(val)));
|
||||
};
|
||||
#else
|
||||
const GreaterThan greater(val);
|
||||
#endif
|
||||
const size_t actual = FindIf(d, in, count, greater);
|
||||
found_any |= actual < count;
|
||||
not_found_any |= actual == count;
|
||||
|
||||
const auto pos = std::find_if(
|
||||
in, in + count, [val](T x) { return x > static_cast<T>(val); });
|
||||
// Convert returned iterator to index.
|
||||
const size_t expected = static_cast<size_t>(pos - in);
|
||||
if (expected != actual) {
|
||||
fprintf(stderr, "%s count %d val %d, expected %d actual %d\n",
|
||||
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
|
||||
val, static_cast<int>(expected), static_cast<int>(actual));
|
||||
hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count,
|
||||
0, count);
|
||||
HWY_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// We will always not-find something due to val=9.
|
||||
HWY_ASSERT(not_found_any);
|
||||
// We'll find something unless the input is empty or {0} - because 0 > i
|
||||
// is false for all i=[0,9].
|
||||
if (count != 0 && in[0] != 0) {
|
||||
HWY_ASSERT(found_any);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllFindIf() {
|
||||
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(FindTest);
|
||||
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind);
|
||||
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf);
|
||||
} // namespace hwy
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,262 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target include guard
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
|
||||
#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// These functions avoid having to write a loop plus remainder handling in the
|
||||
// (unfortunately still common) case where arrays are not aligned/padded. If the
|
||||
// inputs are known to be aligned/padded, it is more efficient to write a single
|
||||
// loop using Load(). We do not provide a TransformAlignedPadded because it
|
||||
// would be more verbose than such a loop.
|
||||
//
|
||||
// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
|
||||
// generic lambda if using C++14. Due to apparent limitations of Clang on
|
||||
// Windows, it is currently necessary to add HWY_ATTR before the opening { of
|
||||
// the lambda to avoid errors about "always_inline function .. requires target".
|
||||
//
|
||||
// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
|
||||
// we use `MaskedLoad` and `BlendedStore` to read/write the final partial
|
||||
// vector.
|
||||
|
||||
// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
|
||||
// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
|
||||
// the value of its lane i is i, and increases by `Lanes(d)` after every call.
|
||||
// Note that some of these indices may be `>= count`, but the elements that
|
||||
// `func` returns in those lanes will not be written to `out`.
|
||||
template <class D, class Func, typename T = TFromD<D>>
|
||||
void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
|
||||
const RebindToUnsigned<D> du;
|
||||
using TU = TFromD<decltype(du)>;
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
size_t idx = 0;
|
||||
Vec<decltype(du)> vidx = Iota(du, 0);
|
||||
for (; idx + N <= count; idx += N) {
|
||||
StoreU(func(d, vidx), d, out + idx);
|
||||
vidx = Add(vidx, Set(du, static_cast<TU>(N)));
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Proceed one by one.
|
||||
const CappedTag<T, 1> d1;
|
||||
const RebindToUnsigned<decltype(d1)> du1;
|
||||
for (; idx < count; ++idx) {
|
||||
StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
BlendedStore(func(d, vidx), mask, d, out + idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
|
||||
// array elements by a constant.
|
||||
template <class D, class Func, typename T = TFromD<D>>
|
||||
void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
const Vec<D> v = LoadU(d, inout + idx);
|
||||
StoreU(func(d, v), d, inout + idx);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Proceed one by one.
|
||||
const CappedTag<T, 1> d1;
|
||||
for (; idx < count; ++idx) {
|
||||
using V1 = Vec<decltype(d1)>;
|
||||
const V1 v = LoadU(d1, inout + idx);
|
||||
StoreU(func(d1, v), d1, inout + idx);
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
|
||||
BlendedStore(func(d, v), mask, d, inout + idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
|
||||
// multiplying array elements by those of another array.
|
||||
template <class D, class Func, typename T = TFromD<D>>
|
||||
void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
|
||||
const T* HWY_RESTRICT in1, const Func& func) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
const Vec<D> v = LoadU(d, inout + idx);
|
||||
const Vec<D> v1 = LoadU(d, in1 + idx);
|
||||
StoreU(func(d, v, v1), d, inout + idx);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Proceed one by one.
|
||||
const CappedTag<T, 1> d1;
|
||||
for (; idx < count; ++idx) {
|
||||
using V1 = Vec<decltype(d1)>;
|
||||
const V1 v = LoadU(d1, inout + idx);
|
||||
const V1 v1 = LoadU(d1, in1 + idx);
|
||||
StoreU(func(d1, v, v1), d1, inout + idx);
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
|
||||
const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
|
||||
BlendedStore(func(d, v, v1), mask, d, inout + idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
|
||||
// usage: FMA of elements from three arrays, stored into the first array.
|
||||
template <class D, class Func, typename T = TFromD<D>>
|
||||
void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
|
||||
const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
|
||||
const Func& func) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
const Vec<D> v = LoadU(d, inout + idx);
|
||||
const Vec<D> v1 = LoadU(d, in1 + idx);
|
||||
const Vec<D> v2 = LoadU(d, in2 + idx);
|
||||
StoreU(func(d, v, v1, v2), d, inout + idx);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Proceed one by one.
|
||||
const CappedTag<T, 1> d1;
|
||||
for (; idx < count; ++idx) {
|
||||
using V1 = Vec<decltype(d1)>;
|
||||
const V1 v = LoadU(d1, inout + idx);
|
||||
const V1 v1 = LoadU(d1, in1 + idx);
|
||||
const V1 v2 = LoadU(d1, in2 + idx);
|
||||
StoreU(func(d1, v, v1, v2), d1, inout + idx);
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
|
||||
const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
|
||||
const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
|
||||
BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class D, typename T = TFromD<D>>
|
||||
void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
|
||||
const size_t N = Lanes(d);
|
||||
const Vec<D> old_v = Set(d, old_t);
|
||||
const Vec<D> new_v = Set(d, new_t);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
Vec<D> v = LoadU(d, inout + idx);
|
||||
StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Proceed one by one.
|
||||
const CappedTag<T, 1> d1;
|
||||
const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
|
||||
const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
|
||||
for (; idx < count; ++idx) {
|
||||
using V1 = Vec<decltype(d1)>;
|
||||
const V1 v1 = LoadU(d1, inout + idx);
|
||||
StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
|
||||
BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class D, class Func, typename T = TFromD<D>>
|
||||
void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
|
||||
const Func& func) {
|
||||
const size_t N = Lanes(d);
|
||||
const Vec<D> new_v = Set(d, new_t);
|
||||
|
||||
size_t idx = 0;
|
||||
for (; idx + N <= count; idx += N) {
|
||||
Vec<D> v = LoadU(d, inout + idx);
|
||||
StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
|
||||
}
|
||||
|
||||
// `count` was a multiple of the vector length `N`: already done.
|
||||
if (HWY_UNLIKELY(idx == count)) return;
|
||||
|
||||
#if HWY_MEM_OPS_MIGHT_FAULT
|
||||
// Proceed one by one.
|
||||
const CappedTag<T, 1> d1;
|
||||
const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
|
||||
for (; idx < count; ++idx) {
|
||||
using V1 = Vec<decltype(d1)>;
|
||||
const V1 v = LoadU(d1, inout + idx);
|
||||
StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
|
||||
}
|
||||
#else
|
||||
const size_t remaining = count - idx;
|
||||
HWY_DASSERT(0 != remaining && remaining < N);
|
||||
const Mask<D> mask = FirstN(d, remaining);
|
||||
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
|
||||
BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
|
||||
#endif
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
|
||||
@@ -0,0 +1,372 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string.h> // memcpy
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
#include "hwy/contrib/algo/transform-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
// If your project requires C++14 or later, you can ignore this and pass lambdas
|
||||
// directly to Transform, without requiring an lvalue as we do here for C++11.
|
||||
#if __cplusplus < 201402L
|
||||
#define HWY_GENERIC_LAMBDA 0
|
||||
#else
|
||||
#define HWY_GENERIC_LAMBDA 1
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Scale factor used by the SCAL/AXPY reference loops and their vector
// counterparts, converted to the lane type T. The value itself is arbitrary.
template <typename T>
T Alpha() {
  const double kArbitraryScalar = 1.5;
  return static_cast<T>(kArbitraryScalar);
}
|
||||
|
||||
// Returns random floating-point number in [-8, 8) to ensure computations do
|
||||
// not exceed float32 precision.
|
||||
template <typename T>
|
||||
T Random(RandomState& rng) {
|
||||
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
|
||||
const double val = (bits - 512) / 64.0;
|
||||
// Clamp negative to zero for unsigned types.
|
||||
return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
|
||||
}
|
||||
|
||||
// SCAL, AXPY names are from BLAS.
|
||||
template <typename T>
|
||||
HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) {
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
out[i] = Alpha<T>() * x[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) {
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
out[i] = Alpha<T>() * x[i] + y[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out,
|
||||
size_t count) {
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
out[i] = x[i] * y[i] + z[i];
|
||||
}
|
||||
}
|
||||
|
||||
// In C++14, we can instead define these as generic lambdas next to where they
|
||||
// are invoked.
|
||||
#if !HWY_GENERIC_LAMBDA
|
||||
|
||||
// Generator that returns even numbers by doubling the output indices.
|
||||
struct Gen2 {
|
||||
template <class D, class VU>
|
||||
Vec<D> operator()(D d, VU vidx) const {
|
||||
return BitCast(d, Add(vidx, vidx));
|
||||
}
|
||||
};
|
||||
|
||||
struct SCAL {
|
||||
template <class D, class V>
|
||||
Vec<D> operator()(D d, V v) const {
|
||||
using T = TFromD<D>;
|
||||
return Mul(Set(d, Alpha<T>()), v);
|
||||
}
|
||||
};
|
||||
|
||||
struct AXPY {
|
||||
template <class D, class V>
|
||||
Vec<D> operator()(D d, V v, V v1) const {
|
||||
using T = TFromD<D>;
|
||||
return MulAdd(Set(d, Alpha<T>()), v, v1);
|
||||
}
|
||||
};
|
||||
|
||||
struct FMA4 {
|
||||
template <class D, class V>
|
||||
Vec<D> operator()(D /*d*/, V v, V v1, V v2) const {
|
||||
return MulAdd(v, v1, v2);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // !HWY_GENERIC_LAMBDA
|
||||
|
||||
// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from
|
||||
// ForFloatTypes.
|
||||
template <class Test>
|
||||
struct ForeachCountAndMisalign {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) const {
|
||||
RandomState rng;
|
||||
const size_t N = Lanes(d);
|
||||
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
|
||||
|
||||
for (size_t count = 0; count < 2 * N; ++count) {
|
||||
for (size_t ma : misalignments) {
|
||||
for (size_t mb : misalignments) {
|
||||
Test()(d, count, ma, mb, rng);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Output-only, no loads
|
||||
struct TestGenerate {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/,
|
||||
RandomState& /*rng*/) {
|
||||
using T = TFromD<D>;
|
||||
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count + 1);
|
||||
T* actual = pa.get() + misalign_a;
|
||||
|
||||
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
expected[i] = static_cast<T>(2 * i);
|
||||
}
|
||||
|
||||
// TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
|
||||
// the attribute also applies to lambdas? If so, remove HWY_ATTR.
|
||||
#if HWY_GENERIC_LAMBDA
|
||||
const auto gen2 = [](const auto d, const auto vidx)
|
||||
HWY_ATTR { return BitCast(d, Add(vidx, vidx)); };
|
||||
#else
|
||||
const Gen2 gen2;
|
||||
#endif
|
||||
actual[count] = T{0}; // sentinel
|
||||
Generate(d, actual, count, gen2);
|
||||
HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
const char* target_name = hwy::TargetName(HWY_TARGET);
|
||||
hwy::detail::AssertArrayEqual(info, expected.get(), actual, count,
|
||||
target_name, __FILE__, __LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
// Zero extra input arrays
|
||||
struct TestTransform {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
if (misalign_b != 0) return;
|
||||
using T = TFromD<D>;
|
||||
// Prevents error if size to allocate is zero.
|
||||
AlignedFreeUniquePtr<T[]> pa =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
T* a = pa.get() + misalign_a;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
a[i] = Random<T>(rng);
|
||||
}
|
||||
|
||||
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
|
||||
SimpleSCAL(a, expected.get(), count);
|
||||
|
||||
// TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
|
||||
// the attribute also applies to lambdas? If so, remove HWY_ATTR.
|
||||
#if HWY_GENERIC_LAMBDA
|
||||
const auto scal = [](const auto d, const auto v)
|
||||
HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); };
|
||||
#else
|
||||
const SCAL scal;
|
||||
#endif
|
||||
Transform(d, a, count, scal);
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
const char* target_name = hwy::TargetName(HWY_TARGET);
|
||||
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
// One extra input array
|
||||
struct TestTransform1 {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
// Prevents error if size to allocate is zero.
|
||||
AlignedFreeUniquePtr<T[]> pa =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
AlignedFreeUniquePtr<T[]> pb =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
|
||||
T* a = pa.get() + misalign_a;
|
||||
T* b = pb.get() + misalign_b;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
a[i] = Random<T>(rng);
|
||||
b[i] = Random<T>(rng);
|
||||
}
|
||||
|
||||
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
|
||||
SimpleAXPY(a, b, expected.get(), count);
|
||||
|
||||
#if HWY_GENERIC_LAMBDA
|
||||
const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR {
|
||||
return MulAdd(Set(d, Alpha<T>()), v, v1);
|
||||
};
|
||||
#else
|
||||
const AXPY axpy;
|
||||
#endif
|
||||
Transform1(d, a, count, b, axpy);
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
const char* target_name = hwy::TargetName(HWY_TARGET);
|
||||
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
// Two extra input arrays
|
||||
struct TestTransform2 {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
// Prevents error if size to allocate is zero.
|
||||
AlignedFreeUniquePtr<T[]> pa =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
AlignedFreeUniquePtr<T[]> pb =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
|
||||
AlignedFreeUniquePtr<T[]> pc =
|
||||
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
|
||||
T* a = pa.get() + misalign_a;
|
||||
T* b = pb.get() + misalign_b;
|
||||
T* c = pc.get() + misalign_a;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
a[i] = Random<T>(rng);
|
||||
b[i] = Random<T>(rng);
|
||||
c[i] = Random<T>(rng);
|
||||
}
|
||||
|
||||
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
|
||||
SimpleFMA4(a, b, c, expected.get(), count);
|
||||
|
||||
#if HWY_GENERIC_LAMBDA
|
||||
const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2)
|
||||
HWY_ATTR { return MulAdd(v, v1, v2); };
|
||||
#else
|
||||
const FMA4 fma4;
|
||||
#endif
|
||||
Transform2(d, a, count, b, c, fma4);
|
||||
|
||||
const auto info = hwy::detail::MakeTypeInfo<T>();
|
||||
const char* target_name = hwy::TargetName(HWY_TARGET);
|
||||
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
|
||||
__FILE__, __LINE__);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class IfEq {
|
||||
public:
|
||||
IfEq(T val) : val_(val) {}
|
||||
|
||||
template <class D, class V>
|
||||
Mask<D> operator()(D d, V v) const {
|
||||
return Eq(v, Set(d, val_));
|
||||
}
|
||||
|
||||
private:
|
||||
T val_;
|
||||
};
|
||||
|
||||
struct TestReplace {
|
||||
template <class D>
|
||||
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
if (misalign_b != 0) return;
|
||||
if (count == 0) return;
|
||||
using T = TFromD<D>;
|
||||
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count);
|
||||
T* a = pa.get() + misalign_a;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
a[i] = Random<T>(rng);
|
||||
}
|
||||
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count);
|
||||
|
||||
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count);
|
||||
|
||||
std::vector<size_t> positions(AdjustedReps(count));
|
||||
for (size_t& pos : positions) {
|
||||
pos = static_cast<size_t>(rng()) % count;
|
||||
}
|
||||
|
||||
for (size_t pos = 0; pos < count; ++pos) {
|
||||
const T old_t = a[pos];
|
||||
const T new_t = Random<T>(rng);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
expected[i] = IsEqual(a[i], old_t) ? new_t : a[i];
|
||||
}
|
||||
|
||||
// Copy so ReplaceIf gets the same input (and thus also outputs expected)
|
||||
memcpy(pb.get(), a, count * sizeof(T));
|
||||
|
||||
Replace(d, a, count, new_t, old_t);
|
||||
HWY_ASSERT_ARRAY_EQ(expected.get(), a, count);
|
||||
|
||||
ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t));
|
||||
HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllGenerate() {
|
||||
// The test BitCast-s the indices, which does not work for floats.
|
||||
ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>());
|
||||
}
|
||||
|
||||
void TestAllTransform() {
|
||||
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>());
|
||||
}
|
||||
|
||||
void TestAllTransform1() {
|
||||
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>());
|
||||
}
|
||||
|
||||
void TestAllTransform2() {
|
||||
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>());
|
||||
}
|
||||
|
||||
void TestAllReplace() {
|
||||
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
// Test registration for the TransformTest suite. This file names itself as
// HWY_TARGET_INCLUDE before including hwy/foreach_target.h.
// NOTE(review): presumably HWY_ONCE restricts this section to a single one of
// the resulting per-target passes - confirm against hwy/foreach_target.h.
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
// Must precede the HWY_EXPORT_AND_TEST_P lines that name the same suite.
HWY_BEFORE_TEST(TransformTest);
|
||||
// One entry per TestAll* function defined in hwy::HWY_NAMESPACE above.
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
|
||||
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
|
||||
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
|
||||
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
|
||||
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
|
||||
} // namespace hwy
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,252 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Include guard (still compiled once per target)
|
||||
#include <cmath>
|
||||
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
|
||||
#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct Dot {
|
||||
// Specify zero or more of these, ORed together, as the kAssumptions template
|
||||
// argument to Compute. Each one may improve performance or reduce code size,
|
||||
// at the cost of additional requirements on the arguments.
|
||||
enum Assumptions {
|
||||
// num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T).
|
||||
kAtLeastOneVector = 1,
|
||||
// num_elements is divisible by N (a power of two, so this can be used if
|
||||
// the problem size is known to be a power of two >= HWY_MAX_BYTES /
|
||||
// sizeof(T)).
|
||||
kMultipleOfVector = 2,
|
||||
// RoundUpTo(num_elements, N) elements are accessible; their value does not
|
||||
// matter (will be treated as if they were zero).
|
||||
kPaddedToVector = 4,
|
||||
};
|
||||
|
||||
// Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the
|
||||
// pointers to a multiple of N elements is helpful but not required.
|
||||
template <int kAssumptions, class D, typename T = TFromD<D>,
|
||||
HWY_IF_NOT_LANE_SIZE_D(D, 2)>
|
||||
static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
|
||||
const T* const HWY_RESTRICT pb,
|
||||
const size_t num_elements) {
|
||||
static_assert(IsFloat<T>(), "MulAdd requires float type");
|
||||
using V = decltype(Zero(d));
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
size_t i = 0;
|
||||
|
||||
constexpr bool kIsAtLeastOneVector =
|
||||
(kAssumptions & kAtLeastOneVector) != 0;
|
||||
constexpr bool kIsMultipleOfVector =
|
||||
(kAssumptions & kMultipleOfVector) != 0;
|
||||
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
|
||||
|
||||
// Won't be able to do a full vector load without padding => scalar loop.
|
||||
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
|
||||
HWY_UNLIKELY(num_elements < N)) {
|
||||
// Only 2x unroll to avoid excessive code size.
|
||||
T sum0 = T(0);
|
||||
T sum1 = T(0);
|
||||
for (; i + 2 <= num_elements; i += 2) {
|
||||
sum0 += pa[i + 0] * pb[i + 0];
|
||||
sum1 += pa[i + 1] * pb[i + 1];
|
||||
}
|
||||
if (i < num_elements) {
|
||||
sum1 += pa[i] * pb[i];
|
||||
}
|
||||
return sum0 + sum1;
|
||||
}
|
||||
|
||||
// Compiler doesn't make independent sum* accumulators, so unroll manually.
|
||||
// 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive
|
||||
// for unaligned inputs (each unaligned pointer halves the throughput
|
||||
// because it occupies both L1 load ports for a cycle). We cannot have
|
||||
// arrays of vectors on RVV/SVE, so always unroll 4x.
|
||||
V sum0 = Zero(d);
|
||||
V sum1 = Zero(d);
|
||||
V sum2 = Zero(d);
|
||||
V sum3 = Zero(d);
|
||||
|
||||
// Main loop: unrolled
|
||||
for (; i + 4 * N <= num_elements; /* i += 4 * N */) { // incr in loop
|
||||
const auto a0 = LoadU(d, pa + i);
|
||||
const auto b0 = LoadU(d, pb + i);
|
||||
i += N;
|
||||
sum0 = MulAdd(a0, b0, sum0);
|
||||
const auto a1 = LoadU(d, pa + i);
|
||||
const auto b1 = LoadU(d, pb + i);
|
||||
i += N;
|
||||
sum1 = MulAdd(a1, b1, sum1);
|
||||
const auto a2 = LoadU(d, pa + i);
|
||||
const auto b2 = LoadU(d, pb + i);
|
||||
i += N;
|
||||
sum2 = MulAdd(a2, b2, sum2);
|
||||
const auto a3 = LoadU(d, pa + i);
|
||||
const auto b3 = LoadU(d, pb + i);
|
||||
i += N;
|
||||
sum3 = MulAdd(a3, b3, sum3);
|
||||
}
|
||||
|
||||
// Up to 3 iterations of whole vectors
|
||||
for (; i + N <= num_elements; i += N) {
|
||||
const auto a = LoadU(d, pa + i);
|
||||
const auto b = LoadU(d, pb + i);
|
||||
sum0 = MulAdd(a, b, sum0);
|
||||
}
|
||||
|
||||
if (!kIsMultipleOfVector) {
|
||||
const size_t remaining = num_elements - i;
|
||||
if (remaining != 0) {
|
||||
if (kIsPaddedToVector) {
|
||||
const auto mask = FirstN(d, remaining);
|
||||
const auto a = LoadU(d, pa + i);
|
||||
const auto b = LoadU(d, pb + i);
|
||||
sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
|
||||
} else {
|
||||
// Unaligned load such that the last element is in the highest lane -
|
||||
// ensures we do not touch any elements outside the valid range.
|
||||
// If we get here, then num_elements >= N.
|
||||
HWY_DASSERT(i >= N);
|
||||
i += remaining - N;
|
||||
const auto skip = FirstN(d, N - remaining);
|
||||
const auto a = LoadU(d, pa + i); // always unaligned
|
||||
const auto b = LoadU(d, pb + i);
|
||||
sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1);
|
||||
}
|
||||
}
|
||||
} // kMultipleOfVector
|
||||
|
||||
// Reduction tree: sum of all accumulators by pairs, then across lanes.
|
||||
sum0 = Add(sum0, sum1);
|
||||
sum2 = Add(sum2, sum3);
|
||||
sum0 = Add(sum0, sum2);
|
||||
return GetLane(SumOfLanes(d, sum0));
|
||||
}
|
||||
|
||||
// Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a
|
||||
// multiple of N elements is helpful but not required.
|
||||
template <int kAssumptions, class D>
|
||||
static HWY_INLINE float Compute(const D d,
|
||||
const bfloat16_t* const HWY_RESTRICT pa,
|
||||
const bfloat16_t* const HWY_RESTRICT pb,
|
||||
const size_t num_elements) {
|
||||
const RebindToUnsigned<D> du16;
|
||||
const Repartition<float, D> df32;
|
||||
|
||||
using V = decltype(Zero(df32));
|
||||
const size_t N = Lanes(d);
|
||||
size_t i = 0;
|
||||
|
||||
constexpr bool kIsAtLeastOneVector =
|
||||
(kAssumptions & kAtLeastOneVector) != 0;
|
||||
constexpr bool kIsMultipleOfVector =
|
||||
(kAssumptions & kMultipleOfVector) != 0;
|
||||
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
|
||||
|
||||
// Won't be able to do a full vector load without padding => scalar loop.
|
||||
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
|
||||
HWY_UNLIKELY(num_elements < N)) {
|
||||
float sum0 = 0.0f; // Only 2x unroll to avoid excessive code size for..
|
||||
float sum1 = 0.0f; // this unlikely(?) case.
|
||||
for (; i + 2 <= num_elements; i += 2) {
|
||||
sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
|
||||
sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
|
||||
}
|
||||
if (i < num_elements) {
|
||||
sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
|
||||
}
|
||||
return sum0 + sum1;
|
||||
}
|
||||
|
||||
// See comment in the other Compute() overload. Unroll 2x, but we need
|
||||
// twice as many sums for ReorderWidenMulAccumulate.
|
||||
V sum0 = Zero(df32);
|
||||
V sum1 = Zero(df32);
|
||||
V sum2 = Zero(df32);
|
||||
V sum3 = Zero(df32);
|
||||
|
||||
// Main loop: unrolled
|
||||
for (; i + 2 * N <= num_elements; /* i += 2 * N */) { // incr in loop
|
||||
const auto a0 = LoadU(d, pa + i);
|
||||
const auto b0 = LoadU(d, pb + i);
|
||||
i += N;
|
||||
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
|
||||
const auto a1 = LoadU(d, pa + i);
|
||||
const auto b1 = LoadU(d, pb + i);
|
||||
i += N;
|
||||
sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
|
||||
}
|
||||
|
||||
// Possibly one more iteration of whole vectors
|
||||
if (i + N <= num_elements) {
|
||||
const auto a0 = LoadU(d, pa + i);
|
||||
const auto b0 = LoadU(d, pb + i);
|
||||
i += N;
|
||||
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
|
||||
}
|
||||
|
||||
if (!kIsMultipleOfVector) {
|
||||
const size_t remaining = num_elements - i;
|
||||
if (remaining != 0) {
|
||||
if (kIsPaddedToVector) {
|
||||
const auto mask = FirstN(du16, remaining);
|
||||
const auto va = LoadU(d, pa + i);
|
||||
const auto vb = LoadU(d, pb + i);
|
||||
const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
|
||||
const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
|
||||
sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
|
||||
|
||||
} else {
|
||||
// Unaligned load such that the last element is in the highest lane -
|
||||
// ensures we do not touch any elements outside the valid range.
|
||||
// If we get here, then num_elements >= N.
|
||||
HWY_DASSERT(i >= N);
|
||||
i += remaining - N;
|
||||
const auto skip = FirstN(du16, N - remaining);
|
||||
const auto va = LoadU(d, pa + i); // always unaligned
|
||||
const auto vb = LoadU(d, pb + i);
|
||||
const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va)));
|
||||
const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb)));
|
||||
sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
|
||||
}
|
||||
}
|
||||
} // kMultipleOfVector
|
||||
|
||||
// Reduction tree: sum of all accumulators by pairs, then across lanes.
|
||||
sum0 = Add(sum0, sum1);
|
||||
sum2 = Add(sum2, sum3);
|
||||
sum0 = Add(sum0, sum2);
|
||||
return GetLane(SumOfLanes(df32, sum0));
|
||||
}
|
||||
};
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
|
||||
@@ -0,0 +1,167 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
#include "hwy/contrib/dot/dot-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
template <typename T>
|
||||
HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) {
|
||||
double sum = 0.0;
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
sum += pa[i] * pb[i];
|
||||
}
|
||||
return static_cast<T>(sum);
|
||||
}
|
||||
|
||||
// Scalar reference for bfloat16 inputs: each element is widened to float via
// F32FromBF16, and the products are accumulated in a float.
HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
                             size_t num) {
  float total = 0.0f;
  for (size_t idx = 0; idx != num; ++idx) {
    const float lhs = F32FromBF16(pa[idx]);
    const float rhs = F32FromBF16(pb[idx]);
    total += lhs * rhs;
  }
  return total;
}
|
||||
|
||||
template <typename T>
|
||||
void SetValue(const float value, T* HWY_RESTRICT ptr) {
|
||||
*ptr = static_cast<T>(value);
|
||||
}
|
||||
void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) {
|
||||
*ptr = BF16FromF32(value);
|
||||
}
|
||||
|
||||
class TestDot {
|
||||
// Computes/verifies one dot product.
|
||||
template <int kAssumptions, class D>
|
||||
void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
|
||||
RandomState& rng) {
|
||||
using T = TFromD<D>;
|
||||
const size_t N = Lanes(d);
|
||||
const auto random_t = [&rng]() {
|
||||
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
|
||||
return static_cast<float>(bits - 512) * (1.0f / 64);
|
||||
};
|
||||
|
||||
const size_t padded =
|
||||
(kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
|
||||
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
|
||||
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
|
||||
T* a = pa.get() + misalign_a;
|
||||
T* b = pb.get() + misalign_b;
|
||||
size_t i = 0;
|
||||
for (; i < num; ++i) {
|
||||
SetValue(random_t(), a + i);
|
||||
SetValue(random_t(), b + i);
|
||||
}
|
||||
// Fill padding with NaN - the values are not used, but avoids MSAN errors.
|
||||
for (; i < padded; ++i) {
|
||||
ScalableTag<float> df1;
|
||||
SetValue(GetLane(NaN(df1)), a + i);
|
||||
SetValue(GetLane(NaN(df1)), b + i);
|
||||
}
|
||||
|
||||
const auto expected = SimpleDot(a, b, num);
|
||||
const auto actual = Dot::Compute<kAssumptions>(d, a, b, num);
|
||||
const auto max = static_cast<decltype(actual)>(8 * 8 * num);
|
||||
HWY_ASSERT(-max <= actual && actual <= max);
|
||||
HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
|
||||
}
|
||||
|
||||
// Runs tests with various alignments.
|
||||
template <int kAssumptions, class D>
|
||||
void ForeachMisalign(D d, size_t num, RandomState& rng) {
|
||||
const size_t N = Lanes(d);
|
||||
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
|
||||
for (size_t ma : misalignments) {
|
||||
for (size_t mb : misalignments) {
|
||||
Test<kAssumptions>(d, num, ma, mb, rng);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs tests with various lengths compatible with the given assumptions.
|
||||
template <int kAssumptions, class D>
|
||||
void ForeachCount(D d, RandomState& rng) {
|
||||
const size_t N = Lanes(d);
|
||||
const size_t counts[] = {1,
|
||||
3,
|
||||
7,
|
||||
16,
|
||||
HWY_MAX(N / 2, 1),
|
||||
HWY_MAX(2 * N / 3, 1),
|
||||
N,
|
||||
N + 1,
|
||||
4 * N / 3,
|
||||
3 * N,
|
||||
8 * N,
|
||||
8 * N + 2};
|
||||
for (size_t num : counts) {
|
||||
if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
|
||||
if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
|
||||
ForeachMisalign<kAssumptions>(d, num, rng);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
RandomState rng;
|
||||
|
||||
// All 8 combinations of the three length-related flags:
|
||||
ForeachCount<0>(d, rng);
|
||||
ForeachCount<Dot::kAtLeastOneVector>(d, rng);
|
||||
ForeachCount<Dot::kMultipleOfVector>(d, rng);
|
||||
ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
|
||||
ForeachCount<Dot::kPaddedToVector>(d, rng);
|
||||
ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
|
||||
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
|
||||
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
|
||||
Dot::kAtLeastOneVector>(d, rng);
|
||||
}
|
||||
};
|
||||
|
||||
void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
|
||||
void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(DotTest);
|
||||
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
|
||||
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
|
||||
} // namespace hwy
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,145 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/image/image.h"
|
||||
|
||||
#include <algorithm> // swap
|
||||
#include <cstddef>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); }
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(GetVectorSize); // Local function.
|
||||
} // namespace
|
||||
|
||||
size_t ImageBase::VectorSize() {
|
||||
// Do not cache result - must return the current value, which may be greater
|
||||
// than the first call if it was subject to DisableTargets!
|
||||
return HWY_DYNAMIC_DISPATCH(GetVectorSize)();
|
||||
}
|
||||
|
||||
size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
|
||||
const size_t vec_size = VectorSize();
|
||||
size_t valid_bytes = xsize * sizeof_t;
|
||||
|
||||
// Allow unaligned accesses starting at the last valid value - this may raise
|
||||
// msan errors unless the user calls InitializePaddingForUnalignedAccesses.
|
||||
// Skip for the scalar case because no extra lanes will be loaded.
|
||||
if (vec_size != 1) {
|
||||
HWY_DASSERT(vec_size >= sizeof_t);
|
||||
valid_bytes += vec_size - sizeof_t;
|
||||
}
|
||||
|
||||
// Round up to vector and cache line size.
|
||||
const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT);
|
||||
size_t bytes_per_row = RoundUpTo(valid_bytes, align);
|
||||
|
||||
// During the lengthy window before writes are committed to memory, CPUs
|
||||
// guard against read after write hazards by checking the address, but
|
||||
// only the lower 11 bits. We avoid a false dependency between writes to
|
||||
// consecutive rows by ensuring their sizes are not multiples of 2 KiB.
|
||||
// Avoid2K prevents the same problem for the planes of an Image3.
|
||||
if (bytes_per_row % HWY_ALIGNMENT == 0) {
|
||||
bytes_per_row += align;
|
||||
}
|
||||
|
||||
HWY_DASSERT(bytes_per_row % align == 0);
|
||||
return bytes_per_row;
|
||||
}
|
||||
|
||||
// Allocating constructor: owns ysize rows of BytesPerRow(xsize, sizeof_t)
// bytes each. `sizeof_t` is the element size and must be 1, 2, 4 or 8.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
                     const size_t sizeof_t)
    : xsize_(static_cast<uint32_t>(xsize)),
      ysize_(static_cast<uint32_t>(ysize)),
      bytes_per_row_(0),
      bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
  HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);

  // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
  // if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
  const bool is_empty = (xsize == 0) || (ysize == 0);
  if (is_empty) return;

  bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
  const size_t total_bytes = bytes_per_row_ * ysize;
  bytes_ = AllocateAligned<uint8_t>(total_bytes);
  HWY_ASSERT(bytes_.get() != nullptr);
  InitializePadding(sizeof_t, Padding::kRoundUp);
}
|
||||
|
||||
// Wrapping constructor: refers to caller-provided memory at `aligned` without
// taking ownership (the deleter is AlignedFreer::DoNothing). Both the pointer
// and `bytes_per_row` must be multiples of VectorSize(), which is enforced by
// the assertions below.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
                     const size_t bytes_per_row, void* const aligned)
    : xsize_(static_cast<uint32_t>(xsize)),
      ysize_(static_cast<uint32_t>(ysize)),
      bytes_per_row_(bytes_per_row),
      bytes_(static_cast<uint8_t*>(aligned),
             AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
  const size_t vec_size = VectorSize();

  // Stride first, then base address.
  HWY_ASSERT(bytes_per_row % vec_size == 0);
  HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
}
|
||||
|
||||
// In MSAN (or IDE) builds, zero-fills the bytes past the valid pixels of
// every row so that vector accesses permitted by `padding` do not touch
// uninitialized memory. In all other builds this does nothing.
void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#if HWY_IS_MSAN || HWY_IDE
  // Empty images have no rows to initialize.
  if (xsize_ == 0 || ysize_ == 0) return;

  const size_t vector_bytes = VectorSize();  // Bytes, independent of sizeof_t!
  if (vector_bytes == 1) return;  // Scalar mode: no padding needed

  const size_t valid_bytes = xsize_ * sizeof_t;

  // One past the last byte that the chosen padding mode allows to be touched.
  size_t end_bytes;
  if (padding == Padding::kRoundUp) {
    end_bytes = RoundUpTo(valid_bytes, vector_bytes);
  } else {
    end_bytes = valid_bytes + vector_bytes - sizeof_t;
  }
  if (end_bytes == valid_bytes) return;

#if defined(__clang__) && (__clang_major__ <= 6)
  // There's a bug in msan in clang-6 when handling AVX2 operations. This
  // workaround allows tests to pass on msan, although it is slower and
  // prevents msan warnings from uninitialized images.
  const size_t first_byte = 0;
#else
  const size_t first_byte = valid_bytes;
#endif  // clang6
  const size_t num_bytes = end_bytes - first_byte;

  for (size_t y = 0; y < ysize_; ++y) {
    uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
    memset(row + first_byte, 0, num_bytes);
  }
#else
  (void)sizeof_t;
  (void)padding;
#endif  // HWY_IS_MSAN
}
|
||||
|
||||
// Exchanges dimensions, stride and storage (including its deleter) with
// `other`.
void ImageBase::Swap(ImageBase& other) {
  // Storage and the stride that describes it.
  std::swap(bytes_, other.bytes_);
  std::swap(bytes_per_row_, other.bytes_per_row_);

  // Valid dimensions.
  std::swap(xsize_, other.xsize_);
  std::swap(ysize_, other.ysize_);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,471 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
|
||||
#define HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
|
||||
|
||||
// SIMD/multicore-friendly planar image representation with row accessors.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <utility> // std::move
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/highway_export.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Type-independent parts of Image<> - reduces code duplication and facilitates
|
||||
// moving member function implementations to cc file.
|
||||
struct HWY_CONTRIB_DLLEXPORT ImageBase {
|
||||
// Returns required alignment in bytes for externally allocated memory.
|
||||
static size_t VectorSize();
|
||||
|
||||
// Returns distance [bytes] between the start of two consecutive rows, a
|
||||
// multiple of VectorSize but NOT kAlias (see implementation).
|
||||
static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
|
||||
|
||||
// No allocation (for output params or unused images)
|
||||
ImageBase()
|
||||
: xsize_(0),
|
||||
ysize_(0),
|
||||
bytes_per_row_(0),
|
||||
bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
|
||||
|
||||
// Allocates memory (this is the common case)
|
||||
ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
|
||||
|
||||
// References but does not take ownership of external memory. Useful for
|
||||
// interoperability with other libraries. `aligned` must be aligned to a
|
||||
// multiple of VectorSize() and `bytes_per_row` must also be a multiple of
|
||||
// VectorSize() or preferably equal to BytesPerRow().
|
||||
ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
|
||||
|
||||
// Copy construction/assignment is forbidden to avoid inadvertent copies,
|
||||
// which can be very expensive. Use CopyImageTo() instead.
|
||||
ImageBase(const ImageBase& other) = delete;
|
||||
ImageBase& operator=(const ImageBase& other) = delete;
|
||||
|
||||
// Move constructor (required for returning Image from function)
|
||||
ImageBase(ImageBase&& other) noexcept = default;
|
||||
|
||||
// Move assignment (required for std::vector)
|
||||
ImageBase& operator=(ImageBase&& other) noexcept = default;
|
||||
|
||||
void Swap(ImageBase& other);
|
||||
|
||||
// Useful for pre-allocating image with some padding for alignment purposes
|
||||
// and later reporting the actual valid dimensions. Caller is responsible
|
||||
// for ensuring xsize/ysize are <= the original dimensions.
|
||||
void ShrinkTo(const size_t xsize, const size_t ysize) {
|
||||
xsize_ = static_cast<uint32_t>(xsize);
|
||||
ysize_ = static_cast<uint32_t>(ysize);
|
||||
// NOTE: we can't recompute bytes_per_row for more compact storage and
|
||||
// better locality because that would invalidate the image contents.
|
||||
}
|
||||
|
||||
// How many pixels.
|
||||
HWY_INLINE size_t xsize() const { return xsize_; }
|
||||
HWY_INLINE size_t ysize() const { return ysize_; }
|
||||
|
||||
// NOTE: do not use this for copying rows - the valid xsize may be much less.
|
||||
HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
|
||||
|
||||
// Raw access to byte contents, for interfacing with other libraries.
|
||||
// Unsigned char instead of char to avoid surprises (sign extension).
|
||||
HWY_INLINE uint8_t* bytes() {
|
||||
void* p = bytes_.get();
|
||||
return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
|
||||
}
|
||||
HWY_INLINE const uint8_t* bytes() const {
|
||||
const void* p = bytes_.get();
|
||||
return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
|
||||
}
|
||||
|
||||
protected:
|
||||
// Returns pointer to the start of a row.
|
||||
HWY_INLINE void* VoidRow(const size_t y) const {
|
||||
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
|
||||
if (y >= ysize_) {
|
||||
HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
|
||||
}
|
||||
#endif
|
||||
|
||||
void* row = bytes_.get() + y * bytes_per_row_;
|
||||
return HWY_ASSUME_ALIGNED(row, 64);
|
||||
}
|
||||
|
||||
enum class Padding {
|
||||
// Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
|
||||
kRoundUp,
|
||||
// Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
|
||||
// vector to be initialized. If done by default, this would suppress
|
||||
// legitimate msan warnings. We therefore require users to explicitly call
|
||||
// InitializePadding before using unaligned loads (e.g. convolution).
|
||||
kUnaligned
|
||||
};
|
||||
|
||||
// Initializes the minimum bytes required to suppress msan warnings from
|
||||
// legitimate (according to Padding mode) vector loads/stores on the right
|
||||
// border, where some lanes are uninitialized and assumed to be unused.
|
||||
void InitializePadding(size_t sizeof_t, Padding padding);
|
||||
|
||||
// (Members are non-const to enable assignment during move-assignment.)
|
||||
uint32_t xsize_; // In valid pixels, not including any padding.
|
||||
uint32_t ysize_;
|
||||
size_t bytes_per_row_; // Includes padding.
|
||||
AlignedFreeUniquePtr<uint8_t[]> bytes_;
|
||||
};
|
||||
|
||||
// Single channel, aligned rows separated by padding. T must be POD.
|
||||
//
|
||||
// 'Single channel' (one 2D array per channel) simplifies vectorization
|
||||
// (repeating the same operation on multiple adjacent components) without the
|
||||
// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
|
||||
// can easily iterate over all components in a row and Image requires no
|
||||
// knowledge of the pixel format beyond the component type "T".
|
||||
//
|
||||
// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
|
||||
// false sharing between two threads operating on adjacent rows.
|
||||
//
|
||||
// 'Padding' is still relevant because vectors could potentially be larger than
|
||||
// a cache line. By rounding up row sizes to the vector size, we allow
|
||||
// reading/writing ALIGNED vectors whose first lane is a valid sample. This
|
||||
// avoids needing a separate loop to handle remaining unaligned lanes.
|
||||
//
|
||||
// This image layout could also be achieved with a vector and a row accessor
|
||||
// function, but a class wrapper with support for "deleter" allows wrapping
|
||||
// existing memory allocated by clients without copying the pixels. It also
|
||||
// provides convenient accessors for xsize/ysize, which shortens function
|
||||
// argument lists. Supports move-construction so it can be stored in containers.
|
||||
template <typename ComponentType>
|
||||
class Image : public ImageBase {
|
||||
public:
|
||||
using T = ComponentType;
|
||||
|
||||
Image() = default;
|
||||
Image(const size_t xsize, const size_t ysize)
|
||||
: ImageBase(xsize, ysize, sizeof(T)) {}
|
||||
Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
|
||||
void* aligned)
|
||||
: ImageBase(xsize, ysize, bytes_per_row, aligned) {}
|
||||
|
||||
void InitializePaddingForUnalignedAccesses() {
|
||||
InitializePadding(sizeof(T), Padding::kUnaligned);
|
||||
}
|
||||
|
||||
HWY_INLINE const T* ConstRow(const size_t y) const {
|
||||
return static_cast<const T*>(VoidRow(y));
|
||||
}
|
||||
HWY_INLINE const T* ConstRow(const size_t y) {
|
||||
return static_cast<const T*>(VoidRow(y));
|
||||
}
|
||||
|
||||
// Returns pointer to non-const. This allows passing const Image* parameters
|
||||
// when the callee is only supposed to fill the pixels, as opposed to
|
||||
// allocating or resizing the image.
|
||||
HWY_INLINE T* MutableRow(const size_t y) const {
|
||||
return static_cast<T*>(VoidRow(y));
|
||||
}
|
||||
HWY_INLINE T* MutableRow(const size_t y) {
|
||||
return static_cast<T*>(VoidRow(y));
|
||||
}
|
||||
|
||||
// Returns number of pixels (some of which are padding) per row. Useful for
|
||||
// computing other rows via pointer arithmetic. WARNING: this must
|
||||
// NOT be used to determine xsize.
|
||||
HWY_INLINE intptr_t PixelsPerRow() const {
|
||||
return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
|
||||
}
|
||||
};
|
||||
|
||||
using ImageF = Image<float>;
|
||||
|
||||
// A bundle of 3 same-sized images. To fill an existing Image3 using
|
||||
// single-channel producers, we also need access to each const Image*. Const
|
||||
// prevents breaking the same-size invariant, while still allowing pixels to be
|
||||
// changed via MutableRow.
|
||||
template <typename ComponentType>
|
||||
class Image3 {
|
||||
public:
|
||||
using T = ComponentType;
|
||||
using ImageT = Image<T>;
|
||||
static constexpr size_t kNumPlanes = 3;
|
||||
|
||||
Image3() : planes_{ImageT(), ImageT(), ImageT()} {}
|
||||
|
||||
Image3(const size_t xsize, const size_t ysize)
|
||||
: planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
|
||||
ImageT(xsize, ysize)} {}
|
||||
|
||||
Image3(Image3&& other) noexcept {
|
||||
for (size_t i = 0; i < kNumPlanes; i++) {
|
||||
planes_[i] = std::move(other.planes_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
|
||||
if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
|
||||
HWY_ABORT(
|
||||
"Not same size: %d x %d, %d x %d, %d x %d\n",
|
||||
static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
|
||||
static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
|
||||
static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
|
||||
}
|
||||
planes_[0] = std::move(plane0);
|
||||
planes_[1] = std::move(plane1);
|
||||
planes_[2] = std::move(plane2);
|
||||
}
|
||||
|
||||
// Copy construction/assignment is forbidden to avoid inadvertent copies,
|
||||
// which can be very expensive. Use CopyImageTo instead.
|
||||
Image3(const Image3& other) = delete;
|
||||
Image3& operator=(const Image3& other) = delete;
|
||||
|
||||
Image3& operator=(Image3&& other) noexcept {
|
||||
for (size_t i = 0; i < kNumPlanes; i++) {
|
||||
planes_[i] = std::move(other.planes_[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
|
||||
return static_cast<const T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
|
||||
return static_cast<const T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
|
||||
HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
|
||||
return static_cast<T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
|
||||
return static_cast<T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
|
||||
HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }
|
||||
|
||||
void Swap(Image3& other) {
|
||||
for (size_t c = 0; c < 3; ++c) {
|
||||
other.planes_[c].Swap(planes_[c]);
|
||||
}
|
||||
}
|
||||
|
||||
void ShrinkTo(const size_t xsize, const size_t ysize) {
|
||||
for (ImageT& plane : planes_) {
|
||||
plane.ShrinkTo(xsize, ysize);
|
||||
}
|
||||
}
|
||||
|
||||
// Sizes of all three images are guaranteed to be equal.
|
||||
HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
|
||||
HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
|
||||
// Returns offset [bytes] from one row to the next row of the same plane.
|
||||
// WARNING: this must NOT be used to determine xsize, nor for copying rows -
|
||||
// the valid xsize may be much less.
|
||||
HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
|
||||
// Returns number of pixels (some of which are padding) per row. Useful for
|
||||
// computing other rows via pointer arithmetic. WARNING: this must NOT be used
|
||||
// to determine xsize.
|
||||
HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
|
||||
|
||||
private:
|
||||
// Returns pointer to the start of a row.
|
||||
HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
|
||||
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
|
||||
if (c >= kNumPlanes || y >= ysize()) {
|
||||
HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
|
||||
static_cast<int>(y), static_cast<int>(ysize()));
|
||||
}
|
||||
#endif
|
||||
// Use the first plane's stride because the compiler might not realize they
|
||||
// are all equal. Thus we only need a single multiplication for all planes.
|
||||
const size_t row_offset = y * planes_[0].bytes_per_row();
|
||||
const void* row = planes_[c].bytes() + row_offset;
|
||||
return static_cast<const T * HWY_RESTRICT>(
|
||||
HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT));
|
||||
}
|
||||
|
||||
private:
|
||||
ImageT planes_[kNumPlanes];
|
||||
};
|
||||
|
||||
using Image3F = Image3<float>;
|
||||
|
||||
// Rectangular region in image(s). Factoring this out of Image instead of
|
||||
// shifting the pointer by x0/y0 allows this to apply to multiple images with
|
||||
// different resolutions. Can compare size via SameSize(rect1, rect2).
|
||||
class Rect {
|
||||
public:
|
||||
// Most windows are xsize_max * ysize_max, except those on the borders where
|
||||
// begin + size_max > end.
|
||||
constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
|
||||
size_t ysize_max, size_t xend, size_t yend)
|
||||
: x0_(xbegin),
|
||||
y0_(ybegin),
|
||||
xsize_(ClampedSize(xbegin, xsize_max, xend)),
|
||||
ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
|
||||
|
||||
// Construct with origin and known size (typically from another Rect).
|
||||
constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
|
||||
: x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
|
||||
|
||||
// Construct a rect that covers a whole image.
|
||||
template <typename Image>
|
||||
explicit Rect(const Image& image)
|
||||
: Rect(0, 0, image.xsize(), image.ysize()) {}
|
||||
|
||||
Rect() : Rect(0, 0, 0, 0) {}
|
||||
|
||||
Rect(const Rect&) = default;
|
||||
Rect& operator=(const Rect&) = default;
|
||||
|
||||
Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
|
||||
size_t ysize_max) {
|
||||
return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_,
|
||||
y0_ + ysize_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const T* ConstRow(const Image<T>* image, size_t y) const {
|
||||
return image->ConstRow(y + y0_) + x0_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* MutableRow(const Image<T>* image, size_t y) const {
|
||||
return image->MutableRow(y + y0_) + x0_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
|
||||
return image.ConstPlaneRow(c, y + y0_) + x0_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
|
||||
return image->MutablePlaneRow(c, y + y0_) + x0_;
|
||||
}
|
||||
|
||||
// Returns true if this Rect fully resides in the given image. ImageT could be
|
||||
// Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
|
||||
template <class ImageT>
|
||||
bool IsInside(const ImageT& image) const {
|
||||
return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize());
|
||||
}
|
||||
|
||||
size_t x0() const { return x0_; }
|
||||
size_t y0() const { return y0_; }
|
||||
size_t xsize() const { return xsize_; }
|
||||
size_t ysize() const { return ysize_; }
|
||||
|
||||
private:
|
||||
// Returns size_max, or whatever is left in [begin, end).
|
||||
static constexpr size_t ClampedSize(size_t begin, size_t size_max,
|
||||
size_t end) {
|
||||
return (begin + size_max <= end) ? size_max
|
||||
: (end > begin ? end - begin : 0);
|
||||
}
|
||||
|
||||
size_t x0_;
|
||||
size_t y0_;
|
||||
|
||||
size_t xsize_;
|
||||
size_t ysize_;
|
||||
};
|
||||
|
||||
// Works for any image-like input type(s).
|
||||
template <class Image1, class Image2>
|
||||
HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
|
||||
return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize();
|
||||
}
|
||||
|
||||
// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
|
||||
// We assume the radius (distance outside the image) is small compared to the
|
||||
// image size, otherwise this might not terminate.
|
||||
// The mirror is outside the last column (border pixel is also replicated).
|
||||
static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
                                                 const int64_t xsize) {
  HWY_DASSERT(xsize != 0);

  // TODO(janwas): replace with branchless version
  for (;;) {
    if (x < 0) {
      // Reflect about the left border: -1 => 0, -2 => 1, ...
      x = -x - 1;
    } else if (x >= xsize) {
      // Reflect about the right border: xsize => xsize - 1, ...
      x = 2 * xsize - 1 - x;
    } else {
      break;  // Inside [0, xsize).
    }
  }
  return static_cast<size_t>(x);
}
|
||||
|
||||
// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
|
||||
|
||||
// Mirrors (repeating the edge pixel once). Useful for convolutions.
|
||||
struct WrapMirror {
|
||||
HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
|
||||
return Mirror(coord, static_cast<int64_t>(size));
|
||||
}
|
||||
};
|
||||
|
||||
// Returns the same coordinate, for when we know "coord" is already valid (e.g.
|
||||
// interior of an image).
|
||||
struct WrapUnchanged {
|
||||
HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
|
||||
return static_cast<size_t>(coord);
|
||||
}
|
||||
};
|
||||
|
||||
// Similar to Wrap* but for row pointers (reduces Row() multiplications).
|
||||
|
||||
class WrapRowMirror {
|
||||
public:
|
||||
template <class View>
|
||||
WrapRowMirror(const View& image, size_t ysize)
|
||||
: first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
|
||||
|
||||
const float* operator()(const float* const HWY_RESTRICT row,
|
||||
const int64_t stride) const {
|
||||
if (row < first_row_) {
|
||||
const int64_t num_before = first_row_ - row;
|
||||
// Mirrored; one row before => row 0, two before = row 1, ...
|
||||
return first_row_ + num_before - stride;
|
||||
}
|
||||
if (row > last_row_) {
|
||||
const int64_t num_after = row - last_row_;
|
||||
// Mirrored; one row after => last row, two after = last - 1, ...
|
||||
return last_row_ - num_after + stride;
|
||||
}
|
||||
return row;
|
||||
}
|
||||
|
||||
private:
|
||||
const float* const HWY_RESTRICT first_row_;
|
||||
const float* const HWY_RESTRICT last_row_;
|
||||
};
|
||||
|
||||
struct WrapRowUnchanged {
|
||||
HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
|
||||
int64_t /*stride*/) const {
|
||||
return row;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
|
||||
@@ -0,0 +1,152 @@
|
||||
// Copyright (c) the JPEG XL Project
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/image/image.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <random>
|
||||
#include <utility>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target:
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Ensure we can always write full aligned vectors.
|
||||
struct TestAlignedT {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
std::mt19937 rng(129);
|
||||
std::uniform_int_distribution<int> dist(0, 16);
|
||||
const ScalableTag<T> d;
|
||||
|
||||
for (size_t ysize = 1; ysize < 4; ++ysize) {
|
||||
for (size_t xsize = 1; xsize < 64; ++xsize) {
|
||||
Image<T> img(xsize, ysize);
|
||||
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; x += Lanes(d)) {
|
||||
const auto values = Iota(d, static_cast<T>(dist(rng)));
|
||||
Store(values, d, row + x);
|
||||
}
|
||||
}
|
||||
|
||||
// Sanity check to prevent optimizing out the writes
|
||||
const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
|
||||
const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
|
||||
HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestAlignedT for each type selected by ForUnsignedTypes.
void TestAligned() {
  ForUnsignedTypes(TestAlignedT{});
}
|
||||
|
||||
// Ensure we can write an unaligned vector starting at the last valid value.
|
||||
struct TestUnalignedT {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
std::mt19937 rng(129);
|
||||
std::uniform_int_distribution<int> dist(0, 3);
|
||||
const ScalableTag<T> d;
|
||||
|
||||
for (size_t ysize = 1; ysize < 4; ++ysize) {
|
||||
for (size_t xsize = 1; xsize < 128; ++xsize) {
|
||||
Image<T> img(xsize, ysize);
|
||||
img.InitializePaddingForUnalignedAccesses();
|
||||
|
||||
// This test reads padding, which only works if it was initialized,
|
||||
// which only happens in MSAN builds.
|
||||
#if HWY_IS_MSAN || HWY_IDE
|
||||
// Initialize only the valid samples
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; ++x) {
|
||||
row[x] = static_cast<T>(1u << dist(rng));
|
||||
}
|
||||
}
|
||||
|
||||
// Read padding bits
|
||||
auto accum = Zero(d);
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; ++x) {
|
||||
accum = Or(accum, LoadU(d, row + x));
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure padding was zero
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
Store(accum, d, lanes.get());
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
HWY_ASSERT(lanes[i] < 16);
|
||||
}
|
||||
#else // Check that writing padding does not overwrite valid samples
|
||||
// Initialize only the valid samples
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; ++x) {
|
||||
row[x] = static_cast<T>(x);
|
||||
}
|
||||
}
|
||||
|
||||
// Zero padding and rightmost sample
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
StoreU(Zero(d), d, row + xsize - 1);
|
||||
}
|
||||
|
||||
// Ensure no samples except the rightmost were overwritten
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize - 1; ++x) {
|
||||
HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestUnalignedT for each type selected by ForUnsignedTypes.
void TestUnaligned() {
  ForUnsignedTypes(TestUnalignedT{});
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(ImageTest);
|
||||
HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
|
||||
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
|
||||
} // namespace hwy
|
||||
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,227 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __STDC_FORMAT_MACROS
|
||||
#define __STDC_FORMAT_MACROS // before inttypes.h
|
||||
#endif
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <cfloat> // FLT_MAX
|
||||
#include <type_traits>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
#include "hwy/contrib/math/math-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
template <class Out, class In>
|
||||
inline Out BitCast(const In& in) {
|
||||
static_assert(sizeof(Out) == sizeof(In), "");
|
||||
Out out;
|
||||
CopyBytes<sizeof(out)>(&in, &out);
|
||||
return out;
|
||||
}
|
||||
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
|
||||
Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max,
|
||||
uint64_t max_error_ulp) {
|
||||
using UintT = MakeUnsigned<T>;
|
||||
|
||||
const UintT min_bits = BitCast<UintT>(min);
|
||||
const UintT max_bits = BitCast<UintT>(max);
|
||||
|
||||
// If min is negative and max is positive, the range needs to be broken into
|
||||
// two pieces, [+0, max] and [-0, min], otherwise [min, max].
|
||||
int range_count = 1;
|
||||
UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
|
||||
if ((min < 0.0) && (max > 0.0)) {
|
||||
ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
|
||||
ranges[0][1] = max_bits;
|
||||
ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
|
||||
ranges[1][1] = min_bits;
|
||||
range_count = 2;
|
||||
}
|
||||
|
||||
uint64_t max_ulp = 0;
|
||||
// Emulation is slower, so cannot afford as many.
|
||||
constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
|
||||
for (int range_index = 0; range_index < range_count; ++range_index) {
|
||||
const UintT start = ranges[range_index][0];
|
||||
const UintT stop = ranges[range_index][1];
|
||||
const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange));
|
||||
for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
|
||||
// For reasons unknown, the HWY_MAX is necessary on RVV, otherwise
|
||||
// value_bits can be less than start, and thus possibly NaN.
|
||||
const T value = BitCast<T>(HWY_MIN(HWY_MAX(start, value_bits), stop));
|
||||
const T actual = GetLane(fxN(d, Set(d, value)));
|
||||
const T expected = fx1(value);
|
||||
|
||||
// Skip small inputs and outputs on armv7, it flushes subnormals to zero.
|
||||
#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
|
||||
if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected);
|
||||
max_ulp = HWY_MAX(max_ulp, ulp);
|
||||
if (ulp > max_error_ulp) {
|
||||
fprintf(stderr,
|
||||
"%s: %s(%f) expected %f actual %f ulp %" PRIu64 " max ulp %u\n",
|
||||
hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value,
|
||||
expected, actual, static_cast<uint64_t>(ulp),
|
||||
static_cast<uint32_t>(max_error_ulp));
|
||||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n",
|
||||
hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp);
|
||||
HWY_ASSERT(max_ulp <= max_error_ulp);
|
||||
}
|
||||
|
||||
// Defines TestAll<NAME>(), which passes ForPartialVectors<Test<NAME>>() to
// ForFloatTypes. The functor type Test<NAME> must be declared before this
// macro is expanded.
#define DEFINE_MATH_TEST_FUNC(NAME) \
|
||||
HWY_NOINLINE void TestAll##NAME() { \
|
||||
ForFloatTypes(ForPartialVectors<Test##NAME>()); \
|
||||
}
|
||||
|
||||
// Defines the functor Test<NAME> and, via DEFINE_MATH_TEST_FUNC, the function
// TestAll<NAME>(). The functor's operator()(T, D d) calls TestMath with the
// F32* arguments when sizeof(T) == 4, and otherwise with the F64* arguments
// (F64_MIN and F64_MAX are cast to T).
//   F32x1 / F64x1:         scalar reference function.
//   F32xN / F64xN:         vector function under test.
//   F32_MIN..F32_MAX etc.: input range passed to TestMath.
//   F32_ERROR / F64_ERROR: maximum permitted error in ULP.
#undef DEFINE_MATH_TEST
|
||||
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
|
||||
F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \
|
||||
struct Test##NAME { \
|
||||
template <class T, class D> \
|
||||
HWY_NOINLINE void operator()(T, D d) { \
|
||||
if (sizeof(T) == 4) { \
|
||||
TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \
|
||||
F32_ERROR); \
|
||||
} else { \
|
||||
TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, \
|
||||
static_cast<T>(F64_MIN), static_cast<T>(F64_MAX), \
|
||||
F64_ERROR); \
|
||||
} \
|
||||
} \
|
||||
}; \
|
||||
DEFINE_MATH_TEST_FUNC(NAME)
|
||||
|
||||
// Floating point values closest to but less than 1.0
|
||||
const float kNearOneF = BitCast<float>(0x3F7FFFFF);
|
||||
const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
|
||||
|
||||
// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so
|
||||
// only increase the error tolerance there.
|
||||
// Maximum permitted error, in ULP, for the 64-bit Cos test: 23 when
// __MINGW32__ is defined, 3 otherwise.
#if defined(__MINGW32__)
constexpr uint64_t Cos64ULP() { return 23; }
#else
constexpr uint64_t Cos64ULP() { return 3; }
#endif
|
||||
|
||||
// Maximum permitted error, in ULP, for the 32-bit Acosh test: 8 when
// __MINGW32__ is defined, 3 otherwise.
#if defined(__MINGW32__)
constexpr uint64_t ACosh32ULP() { return 8; }
#else
constexpr uint64_t ACosh32ULP() { return 3; }
#endif
|
||||
|
||||
// clang-format off
|
||||
DEFINE_MATH_TEST(Acos,
|
||||
std::acos, CallAcos, -1.0f, +1.0f, 3, // NEON is 3 instead of 2
|
||||
std::acos, CallAcos, -1.0, +1.0, 2)
|
||||
DEFINE_MATH_TEST(Acosh,
|
||||
std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(),
|
||||
std::acosh, CallAcosh, +1.0, +DBL_MAX, 3)
|
||||
DEFINE_MATH_TEST(Asin,
|
||||
std::asin, CallAsin, -1.0f, +1.0f, 4, // ARMv7 is 4 instead of 2
|
||||
std::asin, CallAsin, -1.0, +1.0, 2)
|
||||
DEFINE_MATH_TEST(Asinh,
|
||||
std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3,
|
||||
std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3)
|
||||
DEFINE_MATH_TEST(Atan,
|
||||
std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3,
|
||||
std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3)
|
||||
DEFINE_MATH_TEST(Atanh,
|
||||
std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4, // NEON is 4 instead of 3
|
||||
std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3)
|
||||
DEFINE_MATH_TEST(Cos,
|
||||
std::cos, CallCos, -39000.0f, +39000.0f, 3,
|
||||
std::cos, CallCos, -39000.0, +39000.0, Cos64ULP())
|
||||
DEFINE_MATH_TEST(Exp,
|
||||
std::exp, CallExp, -FLT_MAX, +104.0f, 1,
|
||||
std::exp, CallExp, -DBL_MAX, +104.0, 1)
|
||||
DEFINE_MATH_TEST(Expm1,
|
||||
std::expm1, CallExpm1, -FLT_MAX, +104.0f, 4,
|
||||
std::expm1, CallExpm1, -DBL_MAX, +104.0, 4)
|
||||
DEFINE_MATH_TEST(Log,
|
||||
std::log, CallLog, +FLT_MIN, +FLT_MAX, 1,
|
||||
std::log, CallLog, +DBL_MIN, +DBL_MAX, 1)
|
||||
DEFINE_MATH_TEST(Log10,
|
||||
std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2,
|
||||
std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2)
|
||||
DEFINE_MATH_TEST(Log1p,
|
||||
std::log1p, CallLog1p, +0.0f, +1e37f, 3, // NEON is 3 instead of 2
|
||||
std::log1p, CallLog1p, +0.0, +DBL_MAX, 2)
|
||||
DEFINE_MATH_TEST(Log2,
|
||||
std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2,
|
||||
std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2)
|
||||
DEFINE_MATH_TEST(Sin,
|
||||
std::sin, CallSin, -39000.0f, +39000.0f, 3,
|
||||
std::sin, CallSin, -39000.0, +39000.0, 4) // MSYS is 4 instead of 3
|
||||
DEFINE_MATH_TEST(Sinh,
|
||||
std::sinh, CallSinh, -80.0f, +80.0f, 4,
|
||||
std::sinh, CallSinh, -709.0, +709.0, 4)
|
||||
DEFINE_MATH_TEST(Tanh,
|
||||
std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4,
|
||||
std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4)
|
||||
// clang-format on
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(HwyMathTest);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
|
||||
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
|
||||
} // namespace hwy
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,190 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
# Unused on Bazel builds, where this is not defined/known; Copybara replaces
|
||||
# usages with an empty list.
|
||||
COMPAT = [
|
||||
"//buildenv/target:non_prod", # includes mobile/vendor.
|
||||
]
|
||||
|
||||
# cc_library(
|
||||
# name = "vxsort",
|
||||
# srcs = [
|
||||
# "vxsort/isa_detection.cpp",
|
||||
# "vxsort/isa_detection_msvc.cpp",
|
||||
# "vxsort/isa_detection_sane.cpp",
|
||||
# "vxsort/machine_traits.avx2.cpp",
|
||||
# "vxsort/smallsort/avx2_load_mask_tables.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
|
||||
# "vxsort/vxsort_stats.cpp",
|
||||
# ],
|
||||
# hdrs = [
|
||||
# "vxsort/alignment.h",
|
||||
# "vxsort/defs.h",
|
||||
# "vxsort/isa_detection.h",
|
||||
# "vxsort/machine_traits.avx2.h",
|
||||
# "vxsort/machine_traits.avx512.h",
|
||||
# "vxsort/machine_traits.h",
|
||||
# "vxsort/packer.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
|
||||
# "vxsort/smallsort/bitonic_sort.h",
|
||||
# "vxsort/vxsort.h",
|
||||
# "vxsort/vxsort_stats.h",
|
||||
# ],
|
||||
# compatible_with = [],
|
||||
# textual_hdrs = [
|
||||
# "vxsort/vxsort_targets_disable.h",
|
||||
# "vxsort/vxsort_targets_enable_avx2.h",
|
||||
# "vxsort/vxsort_targets_enable_avx512.h",
|
||||
# ],
|
||||
# )
|
||||
|
||||
cc_library(
|
||||
name = "vqsort",
|
||||
srcs = [
|
||||
# Split into separate files to reduce MSVC build time.
|
||||
"vqsort.cc",
|
||||
"vqsort_128a.cc",
|
||||
"vqsort_128d.cc",
|
||||
"vqsort_f32a.cc",
|
||||
"vqsort_f32d.cc",
|
||||
"vqsort_f64a.cc",
|
||||
"vqsort_f64d.cc",
|
||||
"vqsort_i16a.cc",
|
||||
"vqsort_i16d.cc",
|
||||
"vqsort_i32a.cc",
|
||||
"vqsort_i32d.cc",
|
||||
"vqsort_i64a.cc",
|
||||
"vqsort_i64d.cc",
|
||||
"vqsort_kv64a.cc",
|
||||
"vqsort_kv64d.cc",
|
||||
"vqsort_kv128a.cc",
|
||||
"vqsort_kv128d.cc",
|
||||
"vqsort_u16a.cc",
|
||||
"vqsort_u16d.cc",
|
||||
"vqsort_u32a.cc",
|
||||
"vqsort_u32d.cc",
|
||||
"vqsort_u64a.cc",
|
||||
"vqsort_u64d.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"vqsort.h", # public interface
|
||||
],
|
||||
compatible_with = [],
|
||||
local_defines = ["hwy_contrib_EXPORTS"],
|
||||
textual_hdrs = [
|
||||
"shared-inl.h",
|
||||
"sorting_networks-inl.h",
|
||||
"traits-inl.h",
|
||||
"traits128-inl.h",
|
||||
"vqsort-inl.h",
|
||||
# Placeholder for internal instrumentation. Do not remove.
|
||||
],
|
||||
deps = [
|
||||
# Only if VQSORT_SECURE_RNG is set.
|
||||
# "//third_party/absl/random",
|
||||
"//:hwy",
|
||||
# ":vxsort", # required if HAVE_VXSORT
|
||||
],
|
||||
)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Internal-only targets
|
||||
|
||||
cc_library(
|
||||
name = "helpers",
|
||||
testonly = 1,
|
||||
textual_hdrs = [
|
||||
"algo-inl.h",
|
||||
"result-inl.h",
|
||||
],
|
||||
deps = [
|
||||
":vqsort",
|
||||
"//:nanobenchmark",
|
||||
# Required for HAVE_PDQSORT, but that is unused and this is
|
||||
# unavailable to Bazel builds, hence commented out.
|
||||
# "//third_party/boost/allowed",
|
||||
# Avoid ips4o and thus TBB to work around hwloc build failure.
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "print_network",
|
||||
testonly = 1,
|
||||
srcs = ["print_network.cc"],
|
||||
deps = [
|
||||
":helpers",
|
||||
":vqsort",
|
||||
"//:hwy",
|
||||
],
|
||||
)
|
||||
|
||||
cc_test(
|
||||
name = "sort_test",
|
||||
size = "medium",
|
||||
srcs = ["sort_test.cc"],
|
||||
# Do not enable fully_static_link (pthread crash on bazel)
|
||||
local_defines = ["HWY_IS_TEST"],
|
||||
# for test_suite.
|
||||
tags = ["hwy_ops_test"],
|
||||
deps = [
|
||||
":helpers",
|
||||
":vqsort",
|
||||
"@com_google_googletest//:gtest_main",
|
||||
"//:hwy",
|
||||
"//:hwy_test_util",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "bench_sort",
|
||||
testonly = 1,
|
||||
srcs = ["bench_sort.cc"],
|
||||
# Do not enable fully_static_link (pthread crash on bazel)
|
||||
local_defines = ["HWY_IS_TEST"],
|
||||
deps = [
|
||||
":helpers",
|
||||
":vqsort",
|
||||
"@com_google_googletest//:gtest_main",
|
||||
"//:hwy",
|
||||
"//:hwy_test_util",
|
||||
],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "bench_parallel",
|
||||
testonly = 1,
|
||||
srcs = ["bench_parallel.cc"],
|
||||
# Do not enable fully_static_link (pthread crash on bazel)
|
||||
local_defines = ["HWY_IS_TEST"],
|
||||
deps = [
|
||||
":helpers",
|
||||
":vqsort",
|
||||
"@com_google_googletest//:gtest_main",
|
||||
"//:hwy",
|
||||
"//:hwy_test_util",
|
||||
],
|
||||
)
|
||||
@@ -0,0 +1,87 @@
|
||||
# Vectorized and performance-portable Quicksort
|
||||
|
||||
## Introduction
|
||||
|
||||
As of 2022-06-07 this sorts large arrays of built-in types about ten times as
|
||||
fast as `std::sort`. See also our
|
||||
[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
|
||||
and [paper](https://arxiv.org/abs/2205.05982).
|
||||
|
||||
## Instructions
|
||||
|
||||
Here are instructions for reproducing our results on Linux and AWS (SVE, NEON).
|
||||
|
||||
### Linux
|
||||
|
||||
Please first ensure that Go (golang) and Clang (tested with 13.0.1) are installed via
|
||||
your system's package manager.
|
||||
|
||||
```
|
||||
go install github.com/bazelbuild/bazelisk@latest
|
||||
git clone https://github.com/google/highway
|
||||
cd highway
|
||||
CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
|
||||
bazel-bin/hwy/contrib/sort/sort_test
|
||||
bazel-bin/hwy/contrib/sort/bench_sort
|
||||
```
|
||||
|
||||
### AWS Graviton3
|
||||
|
||||
Instance config: amazon linux 5.10 arm64, c7g.8xlarge (largest allowed config is
|
||||
32 vCPU). Initial launch will fail. Wait a few minutes for an email saying the
|
||||
config is verified, then re-launch. See IPv4 hostname in list of instances.
|
||||
|
||||
`ssh -i /path/key.pem ec2-user@hostname`
|
||||
|
||||
Note that the AWS CMake package is too old to build LLVM, so we first build a newer CMake from source:
|
||||
```
|
||||
wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
|
||||
tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
|
||||
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
|
||||
make -j8 && sudo make install
|
||||
cd ..
|
||||
```
|
||||
|
||||
AWS clang is at version 11.1, which generates unnecessary `AND` instructions
|
||||
which slow down the sort by 1.15x. We tested with clang trunk as of June 13
|
||||
(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
|
||||
|
||||
```
|
||||
git clone --depth 1 https://github.com/llvm/llvm-project.git
|
||||
cd llvm-project
|
||||
mkdir -p build && cd build
|
||||
/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
|
||||
make -j32 && sudo make install
|
||||
```
|
||||
|
||||
```
|
||||
sudo yum install go
|
||||
go install github.com/bazelbuild/bazelisk@latest
|
||||
git clone https://github.com/google/highway
|
||||
cd highway
|
||||
CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
|
||||
bazel-bin/hwy/contrib/sort/sort_test
|
||||
bazel-bin/hwy/contrib/sort/bench_sort
|
||||
```
|
||||
|
||||
The above command line enables SVE, which is currently only available on
|
||||
Graviton 3. You can also test NEON on the same processor, or other Arm CPUs, by
|
||||
changing the `--copt=-march=` option to `--copt=-march=armv8.2-a+crypto`. Note that
|
||||
such flags will be unnecessary once Clang supports `#pragma target` for NEON and
|
||||
SVE intrinsics, as it does for x86.
|
||||
|
||||
## Results
|
||||
|
||||
`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
|
||||
algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
|
||||
sorted (f32 is float), the distribution of keys (uniform32 for uniform random
|
||||
with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e.
|
||||
number of key bytes output per second).
|
||||
|
||||
Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
|
||||
|
||||
```
|
||||
[ RUN ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
|
||||
AVX3: std: f32: uniform32: 1.00E+06 54 MB/s ( 1 threads)
|
||||
AVX3: vq: f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
|
||||
```
|
||||
@@ -0,0 +1,512 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Normal include guard for target-independent parts
|
||||
#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h> // memcpy
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath> // std::abs
|
||||
#include <vector>
|
||||
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
// Third-party algorithms
|
||||
#define HAVE_AVX2SORT 0
|
||||
#define HAVE_IPS4O 0
|
||||
// When enabling, consider changing max_threads (required for Table 1a)
|
||||
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
|
||||
#define HAVE_PDQSORT 0
|
||||
#define HAVE_SORT512 0
|
||||
#define HAVE_VXSORT 0
|
||||
|
||||
#if HAVE_AVX2SORT
|
||||
HWY_PUSH_ATTRIBUTES("avx2,avx")
|
||||
#include "avx2sort.h" //NOLINT
|
||||
HWY_POP_ATTRIBUTES
|
||||
#endif
|
||||
#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
|
||||
#include "third_party/ips4o/include/ips4o.hpp"
|
||||
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
|
||||
#endif
|
||||
#if HAVE_PDQSORT
|
||||
#include "third_party/boost/allowed/sort/sort.hpp"
|
||||
#endif
|
||||
#if HAVE_SORT512
|
||||
#include "sort512.h" //NOLINT
|
||||
#endif
|
||||
|
||||
// vxsort is difficult to compile for multiple targets because it also uses
|
||||
// .cpp files, and we'd also have to #undef its include guards. Instead, compile
|
||||
// only for AVX2 or AVX3 depending on this macro.
|
||||
#define VXSORT_AVX3 1
|
||||
#if HAVE_VXSORT
|
||||
// inlined from vxsort_targets_enable_avx512 (must close before end of header)
|
||||
#ifdef __GNUC__
|
||||
#ifdef __clang__
|
||||
#if VXSORT_AVX3
|
||||
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
|
||||
apply_to = any(function))
|
||||
#else
|
||||
#pragma clang attribute push(__attribute__((target("avx2"))), \
|
||||
apply_to = any(function))
|
||||
#endif // VXSORT_AVX3
|
||||
|
||||
#else
|
||||
#pragma GCC push_options
|
||||
#if VXSORT_AVX3
|
||||
#pragma GCC target("avx512f,avx512dq")
|
||||
#else
|
||||
#pragma GCC target("avx2")
|
||||
#endif // VXSORT_AVX3
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if VXSORT_AVX3
|
||||
#include "vxsort/machine_traits.avx512.h"
|
||||
#else
|
||||
#include "vxsort/machine_traits.avx2.h"
|
||||
#endif // VXSORT_AVX3
|
||||
#include "vxsort/vxsort.h"
|
||||
#ifdef __GNUC__
|
||||
#ifdef __clang__
|
||||
#pragma clang attribute pop
|
||||
#else
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif // HAVE_VXSORT
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Distribution of the input keys to generate. DistName() maps each value to
// its printable name ("uniform8", "uniform16", "uniform32").
enum class Dist { kUniform8, kUniform16, kUniform32 };
|
||||
|
||||
static inline std::vector<Dist> AllDist() {
|
||||
return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
|
||||
}
|
||||
|
||||
// Returns the printable name of `dist`, or "unreachable" if `dist` holds a
// value that is not one of the enumerators.
static inline const char* DistName(Dist dist) {
  const char* name = "unreachable";
  switch (dist) {
    case Dist::kUniform8:
      name = "uniform8";
      break;
    case Dist::kUniform16:
      name = "uniform16";
      break;
    case Dist::kUniform32:
      name = "uniform32";
      break;
  }
  return name;
}
|
||||
|
||||
template <typename T>
|
||||
class InputStats {
|
||||
public:
|
||||
void Notify(T value) {
|
||||
min_ = std::min(min_, value);
|
||||
max_ = std::max(max_, value);
|
||||
// Converting to integer would truncate floats, multiplying to save digits
|
||||
// risks overflow especially when casting, so instead take the sum of the
|
||||
// bit representations as the checksum.
|
||||
uint64_t bits = 0;
|
||||
static_assert(sizeof(T) <= 8, "Expected a built-in type");
|
||||
CopyBytes<sizeof(T)>(&value, &bits); // not same size
|
||||
sum_ += bits;
|
||||
count_ += 1;
|
||||
}
|
||||
|
||||
bool operator==(const InputStats& other) const {
|
||||
if (count_ != other.count_) {
|
||||
HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
|
||||
static_cast<int>(other.count_));
|
||||
}
|
||||
|
||||
if (min_ != other.min_ || max_ != other.max_) {
|
||||
HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
|
||||
static_cast<double>(max_), static_cast<double>(other.min_),
|
||||
static_cast<double>(other.max_));
|
||||
}
|
||||
|
||||
// Sum helps detect duplicated/lost values
|
||||
if (sum_ != other.sum_) {
|
||||
HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
|
||||
static_cast<double>(sum_), static_cast<double>(other.sum_),
|
||||
static_cast<double>(min_), static_cast<double>(max_));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
T min_ = hwy::HighestValue<T>();
|
||||
T max_ = hwy::LowestValue<T>();
|
||||
uint64_t sum_ = 0;
|
||||
size_t count_ = 0;
|
||||
};
|
||||
|
||||
// Identifies a sorting algorithm. The third-party entries exist only when the
// corresponding HAVE_* macro is nonzero; kStd, kVQSort and kHeap are always
// available. AlgoName() maps each value to its printable name.
enum class Algo {
|
||||
#if HAVE_AVX2SORT
|
||||
kSEA,
|
||||
#endif
|
||||
#if HAVE_IPS4O
|
||||
kIPS4O,
|
||||
#endif
|
||||
#if HAVE_PARALLEL_IPS4O
|
||||
kParallelIPS4O,
|
||||
#endif
|
||||
#if HAVE_PDQSORT
|
||||
kPDQ,
|
||||
#endif
|
||||
#if HAVE_SORT512
|
||||
kSort512,
|
||||
#endif
|
||||
#if HAVE_VXSORT
|
||||
kVXSort,
|
||||
#endif
|
||||
kStd,
|
||||
kVQSort,
|
||||
kHeap,
|
||||
};
|
||||
|
||||
// Returns the printable name of `algo`, or "unreachable" if `algo` holds a
// value that is not one of the (conditionally compiled) enumerators.
static inline const char* AlgoName(Algo algo) {
  const char* name = "unreachable";
  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      name = "sea";
      break;
#endif
#if HAVE_IPS4O
    case Algo::kIPS4O:
      name = "ips4o";
      break;
#endif
#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      name = "par_ips4o";
      break;
#endif
#if HAVE_PDQSORT
    case Algo::kPDQ:
      name = "pdq";
      break;
#endif
#if HAVE_SORT512
    case Algo::kSort512:
      name = "sort512";
      break;
#endif
#if HAVE_VXSORT
    case Algo::kVXSort:
      name = "vxsort";
      break;
#endif
    case Algo::kStd:
      name = "std";
      break;
    case Algo::kVQSort:
      name = "vq";
      break;
    case Algo::kHeap:
      name = "heap";
      break;
  }
  return name;
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
|
||||
|
||||
// Per-target
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
|
||||
#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
|
||||
#endif
|
||||
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h" // HeapSort
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
class Xorshift128Plus {
|
||||
static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
|
||||
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
|
||||
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
|
||||
return z ^ (z >> 31);
|
||||
}
|
||||
|
||||
public:
|
||||
// Generates two vectors of 64-bit seeds via SplitMix64 and stores into
|
||||
// `seeds`. Generating these afresh in each ChoosePivot is too expensive.
|
||||
template <class DU64>
|
||||
static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
|
||||
seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
|
||||
for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
|
||||
seeds[i] = SplitMix64(seeds[i - 1]);
|
||||
}
|
||||
}
|
||||
|
||||
// Need to pass in the state because vector cannot be class members.
|
||||
template <class VU64>
|
||||
static VU64 RandomBits(VU64& state0, VU64& state1) {
|
||||
VU64 s1 = state0;
|
||||
VU64 s0 = state1;
|
||||
const VU64 bits = Add(s1, s0);
|
||||
state0 = s0;
|
||||
s1 = Xor(s1, ShiftLeft<23>(s1));
|
||||
state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
|
||||
return bits;
|
||||
}
|
||||
};
|
||||
|
||||
template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
|
||||
Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
|
||||
const VU64 bits = Xorshift128Plus::RandomBits(s0, s1);
|
||||
return BitCast(d, And(bits, mask));
|
||||
}
|
||||
|
||||
// It is important to avoid denormals, which are flushed to zero by SIMD but not
|
||||
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
|
||||
template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
|
||||
Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
|
||||
using TF = TFromD<DF>;
|
||||
const RebindToUnsigned<decltype(df)> du;
|
||||
using VU = Vec<decltype(du)>;
|
||||
|
||||
const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask);
|
||||
|
||||
#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types
|
||||
using TU = MakeUnsigned<TF>;
|
||||
const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>()));
|
||||
#else
|
||||
const VU bits = BitCast(du, bits64);
|
||||
#endif
|
||||
// Avoid NaN/denormal by only generating values in [1, 2), i.e. random
|
||||
// mantissas with the exponent taken from the representation of 1.0.
|
||||
const VU k1 = BitCast(du, Set(df, TF{1.0}));
|
||||
const VU mantissa_mask = Set(du, MantissaMask<TF>());
|
||||
const VU representation = OrAnd(k1, bits, mantissa_mask);
|
||||
return BitCast(df, representation);
|
||||
}
|
||||
|
||||
template <class DU64>
|
||||
Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
|
||||
switch (sizeof_t) {
|
||||
case 2:
|
||||
return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
|
||||
: 0xFFFFFFFFFFFFFFFFull);
|
||||
case 4:
|
||||
return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull
|
||||
: (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
|
||||
: 0xFFFFFFFFFFFFFFFFull);
|
||||
case 8:
|
||||
return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull
|
||||
: (dist == Dist::kUniform16) ? 0x000000000000FFFFull
|
||||
: 0x00000000FFFFFFFFull);
|
||||
default:
|
||||
HWY_ABORT("Logic error");
|
||||
return Zero(du64);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
|
||||
SortTag<uint64_t> du64;
|
||||
using VU64 = Vec<decltype(du64)>;
|
||||
const size_t N64 = Lanes(du64);
|
||||
auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
|
||||
Xorshift128Plus::GenerateSeeds(du64, seeds.get());
|
||||
VU64 s0 = Load(du64, seeds.get());
|
||||
VU64 s1 = Load(du64, seeds.get() + N64);
|
||||
|
||||
#if HWY_TARGET == HWY_SCALAR
|
||||
const Sisd<T> d;
|
||||
#else
|
||||
const Repartition<T, decltype(du64)> d;
|
||||
#endif
|
||||
using V = Vec<decltype(d)>;
|
||||
const size_t N = Lanes(d);
|
||||
const VU64 mask = MaskForDist(du64, dist, sizeof(T));
|
||||
auto buf = hwy::AllocateAligned<T>(N);
|
||||
|
||||
size_t i = 0;
|
||||
for (; i + N <= num; i += N) {
|
||||
const V values = RandomValues(d, s0, s1, mask);
|
||||
StoreU(values, d, v + i);
|
||||
}
|
||||
if (i < num) {
|
||||
const V values = RandomValues(d, s0, s1, mask);
|
||||
StoreU(values, d, buf.get());
|
||||
memcpy(v + i, buf.get(), (num - i) * sizeof(T));
|
||||
}
|
||||
|
||||
InputStats<T> input_stats;
|
||||
for (size_t i = 0; i < num; ++i) {
|
||||
input_stats.Notify(v[i]);
|
||||
}
|
||||
return input_stats;
|
||||
}
|
||||
|
||||
// State owned by a single benchmark thread. Run() selects the entry via
// shared.tls[thread] and invokes its `sorter` for Algo::kVQSort.
struct ThreadLocal {
|
||||
Sorter sorter;
|
||||
};
|
||||
|
||||
// State passed by reference to every Run() call of a benchmark.
struct SharedState {
|
||||
// Only when parallel IPS4o is available: its thread pool, sized to half of
// std::thread::hardware_concurrency(), capped at max_threads.
#if HAVE_PARALLEL_IPS4O
|
||||
const unsigned max_threads = hwy::LimitsMax<unsigned>(); // 16 for Table 1a
|
||||
ips4o::StdThreadPool pool{static_cast<int>(
|
||||
HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
|
||||
#endif
|
||||
// Indexed by the `thread` argument of Run(). Callers that run on several
// threads resize this first (BenchParallel does so).
std::vector<ThreadLocal> tls{1};
|
||||
};
|
||||
|
||||
// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
|
||||
// non-128-bit keys they are the same:
|
||||
template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
|
||||
void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
|
||||
using detail::TraitsLane;
|
||||
using detail::SharedTraits;
|
||||
if (Order().IsAscending()) {
|
||||
const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
|
||||
return detail::HeapSort(st, keys, num_keys);
|
||||
} else {
|
||||
const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
|
||||
return detail::HeapSort(st, keys, num_keys);
|
||||
}
|
||||
}
|
||||
|
||||
#if VQSORT_ENABLED
|
||||
template <class Order>
|
||||
void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
|
||||
using detail::SharedTraits;
|
||||
using detail::Traits128;
|
||||
uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
|
||||
const size_t num_lanes = num_keys * 2;
|
||||
if (Order().IsAscending()) {
|
||||
const SharedTraits<Traits128<detail::OrderAscending128>> st;
|
||||
return detail::HeapSort(st, lanes, num_lanes);
|
||||
} else {
|
||||
const SharedTraits<Traits128<detail::OrderDescending128>> st;
|
||||
return detail::HeapSort(st, lanes, num_lanes);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Order>
|
||||
void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
|
||||
using detail::SharedTraits;
|
||||
using detail::Traits128;
|
||||
uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
|
||||
const size_t num_lanes = num_keys * 2;
|
||||
if (Order().IsAscending()) {
|
||||
const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
|
||||
return detail::HeapSort(st, lanes, num_lanes);
|
||||
} else {
|
||||
const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
|
||||
return detail::HeapSort(st, lanes, num_lanes);
|
||||
}
|
||||
}
|
||||
#endif // VQSORT_ENABLED
|
||||
|
||||
template <class Order, typename KeyType>
|
||||
void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
|
||||
SharedState& shared, size_t thread) {
|
||||
const std::less<KeyType> less;
|
||||
const std::greater<KeyType> greater;
|
||||
|
||||
switch (algo) {
|
||||
#if HAVE_AVX2SORT
|
||||
case Algo::kSEA:
|
||||
return avx2::quicksort(inout, static_cast<int>(num));
|
||||
#endif
|
||||
|
||||
#if HAVE_IPS4O
|
||||
case Algo::kIPS4O:
|
||||
if (Order().IsAscending()) {
|
||||
return ips4o::sort(inout, inout + num, less);
|
||||
} else {
|
||||
return ips4o::sort(inout, inout + num, greater);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_PARALLEL_IPS4O
|
||||
case Algo::kParallelIPS4O:
|
||||
if (Order().IsAscending()) {
|
||||
return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
|
||||
} else {
|
||||
return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_SORT512
|
||||
case Algo::kSort512:
|
||||
HWY_ABORT("not supported");
|
||||
// return Sort512::Sort(inout, num);
|
||||
#endif
|
||||
|
||||
#if HAVE_PDQSORT
|
||||
case Algo::kPDQ:
|
||||
if (Order().IsAscending()) {
|
||||
return boost::sort::pdqsort_branchless(inout, inout + num, less);
|
||||
} else {
|
||||
return boost::sort::pdqsort_branchless(inout, inout + num, greater);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if HAVE_VXSORT
|
||||
case Algo::kVXSort: {
|
||||
#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
|
||||
(!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
|
||||
fprintf(stderr, "Do not call for target %s\n",
|
||||
hwy::TargetName(HWY_TARGET));
|
||||
return;
|
||||
#else
|
||||
#if VXSORT_AVX3
|
||||
vxsort::vxsort<KeyType, vxsort::AVX512> vx;
|
||||
#else
|
||||
vxsort::vxsort<KeyType, vxsort::AVX2> vx;
|
||||
#endif
|
||||
if (Order().IsAscending()) {
|
||||
return vx.sort(inout, inout + num - 1);
|
||||
} else {
|
||||
fprintf(stderr, "Skipping VX - does not support descending order\n");
|
||||
return;
|
||||
}
|
||||
#endif // enabled for this target
|
||||
}
|
||||
#endif // HAVE_VXSORT
|
||||
|
||||
case Algo::kStd:
|
||||
if (Order().IsAscending()) {
|
||||
return std::sort(inout, inout + num, less);
|
||||
} else {
|
||||
return std::sort(inout, inout + num, greater);
|
||||
}
|
||||
|
||||
case Algo::kVQSort:
|
||||
return shared.tls[thread].sorter(inout, num, Order());
|
||||
|
||||
case Algo::kHeap:
|
||||
return CallHeapSort<Order>(inout, num);
|
||||
|
||||
default:
|
||||
HWY_ABORT("Not implemented");
|
||||
}
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
|
||||
@@ -0,0 +1,238 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Concurrent, independent sorts for generating more memory traffic and testing
|
||||
// scalability.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <condition_variable> //NOLINT
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <mutex> //NOLINT
|
||||
#include <thread> //NOLINT
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
#include "hwy/contrib/sort/result-inl.h"
|
||||
#include "hwy/aligned_allocator.h"
|
||||
// Last
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace {
|
||||
|
||||
class ThreadPool {
|
||||
public:
|
||||
// Starts the given number of worker threads and blocks until they are ready.
|
||||
explicit ThreadPool(
|
||||
const size_t num_threads = std::thread::hardware_concurrency())
|
||||
: num_threads_(num_threads) {
|
||||
HWY_ASSERT(num_threads_ > 0);
|
||||
threads_.reserve(num_threads_);
|
||||
for (size_t i = 0; i < num_threads_; ++i) {
|
||||
threads_.emplace_back(ThreadFunc, this, i);
|
||||
}
|
||||
|
||||
WorkersReadyBarrier();
|
||||
}
|
||||
|
||||
ThreadPool(const ThreadPool&) = delete;
|
||||
ThreadPool& operator&(const ThreadPool&) = delete;
|
||||
|
||||
// Waits for all threads to exit.
|
||||
~ThreadPool() {
|
||||
StartWorkers(kWorkerExit);
|
||||
|
||||
for (std::thread& thread : threads_) {
|
||||
thread.join();
|
||||
}
|
||||
}
|
||||
|
||||
size_t NumThreads() const { return threads_.size(); }
|
||||
|
||||
template <class Func>
|
||||
void RunOnThreads(size_t max_threads, const Func& func) {
|
||||
task_ = &CallClosure<Func>;
|
||||
data_ = &func;
|
||||
StartWorkers(max_threads);
|
||||
WorkersReadyBarrier();
|
||||
}
|
||||
|
||||
private:
|
||||
// After construction and between calls to Run, workers are "ready", i.e.
|
||||
// waiting on worker_start_cv_. They are "started" by sending a "command"
|
||||
// and notifying all worker_start_cv_ waiters. (That is why all workers
|
||||
// must be ready/waiting - otherwise, the notification will not reach all of
|
||||
// them and the main thread waits in vain for them to report readiness.)
|
||||
using WorkerCommand = uint64_t;
|
||||
|
||||
static constexpr WorkerCommand kWorkerWait = ~1ULL;
|
||||
static constexpr WorkerCommand kWorkerExit = ~2ULL;
|
||||
|
||||
// Calls a closure (lambda with captures).
|
||||
template <class Closure>
|
||||
static void CallClosure(const void* f, size_t thread) {
|
||||
(*reinterpret_cast<const Closure*>(f))(thread);
|
||||
}
|
||||
|
||||
void WorkersReadyBarrier() {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
// Typically only a single iteration.
|
||||
while (workers_ready_ != threads_.size()) {
|
||||
workers_ready_cv_.wait(lock);
|
||||
}
|
||||
workers_ready_ = 0;
|
||||
|
||||
// Safely handle spurious worker wakeups.
|
||||
worker_start_command_ = kWorkerWait;
|
||||
}
|
||||
|
||||
// Precondition: all workers are ready.
|
||||
void StartWorkers(const WorkerCommand worker_command) {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
worker_start_command_ = worker_command;
|
||||
// Workers will need this lock, so release it before they wake up.
|
||||
lock.unlock();
|
||||
worker_start_cv_.notify_all();
|
||||
}
|
||||
|
||||
static void ThreadFunc(ThreadPool* self, size_t thread) {
|
||||
// Until kWorkerExit command received:
|
||||
for (;;) {
|
||||
std::unique_lock<std::mutex> lock(self->mutex_);
|
||||
// Notify main thread that this thread is ready.
|
||||
if (++self->workers_ready_ == self->num_threads_) {
|
||||
self->workers_ready_cv_.notify_one();
|
||||
}
|
||||
RESUME_WAIT:
|
||||
// Wait for a command.
|
||||
self->worker_start_cv_.wait(lock);
|
||||
const WorkerCommand command = self->worker_start_command_;
|
||||
switch (command) {
|
||||
case kWorkerWait: // spurious wakeup:
|
||||
goto RESUME_WAIT; // lock still held, avoid incrementing ready.
|
||||
case kWorkerExit:
|
||||
return; // exits thread
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
lock.unlock();
|
||||
// Command is the maximum number of threads that should run the task.
|
||||
HWY_ASSERT(command < self->NumThreads());
|
||||
if (thread < command) {
|
||||
self->task_(self->data_, thread);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const size_t num_threads_;
|
||||
|
||||
// Unmodified after ctor, but cannot be const because we call thread::join().
|
||||
std::vector<std::thread> threads_;
|
||||
|
||||
std::mutex mutex_; // guards both cv and their variables.
|
||||
std::condition_variable workers_ready_cv_;
|
||||
size_t workers_ready_ = 0;
|
||||
std::condition_variable worker_start_cv_;
|
||||
WorkerCommand worker_start_command_;
|
||||
|
||||
// Written by main thread, read by workers (after mutex lock/unlock).
|
||||
std::function<void(const void*, size_t)> task_; // points to CallClosure
|
||||
const void* data_; // points to caller's Func
|
||||
};
|
||||
|
||||
template <class Traits>
|
||||
void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
|
||||
const Algo algo, SharedState& shared, size_t thread) {
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
using Order = typename Traits::Order;
|
||||
const size_t num_lanes = num_keys * st.LanesPerKey();
|
||||
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
|
||||
|
||||
(void)GenerateInput(dist, aligned.get(), num_lanes);
|
||||
|
||||
const Timestamp t0;
|
||||
Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
|
||||
thread);
|
||||
HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
|
||||
}
|
||||
|
||||
void BenchParallel() {
|
||||
// Not interested in benchmark results for other targets on x86
|
||||
if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3)) {
|
||||
return;
|
||||
}
|
||||
|
||||
ThreadPool pool;
|
||||
const size_t NT = pool.NumThreads();
|
||||
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
|
||||
using KeyType = typename decltype(st)::KeyType;
|
||||
const size_t num_keys = size_t{100} * 1000 * 1000;
|
||||
|
||||
#if HAVE_IPS4O
|
||||
const Algo algo = Algo::kIPS4O;
|
||||
#else
|
||||
const Algo algo = Algo::kVQSort;
|
||||
#endif
|
||||
const Dist dist = Dist::kUniform32;
|
||||
|
||||
SharedState shared;
|
||||
shared.tls.resize(NT);
|
||||
|
||||
std::vector<Result> results;
|
||||
for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) {
|
||||
Timestamp t0;
|
||||
// Default capture because MSVC wants algo/dist but clang does not.
|
||||
pool.RunOnThreads(nt, [=, &shared](size_t thread) {
|
||||
RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
|
||||
});
|
||||
const double sec = SecondsSince(t0);
|
||||
results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
|
||||
st.KeyString());
|
||||
results.back().Print();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_BEFORE_TEST(BenchParallel);
|
||||
HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
|
||||
} // namespace
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,310 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
#include "hwy/contrib/sort/result-inl.h"
|
||||
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
// Mode for larger sorts because M1 is able to access more than the per-core
|
||||
// share of L2, so 1M elements might still be in cache.
|
||||
#define SORT_100M 0
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
// Defined within HWY_ONCE, used by BenchAllSort.
|
||||
extern int64_t first_sort_target;
|
||||
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace {
|
||||
using detail::TraitsLane;
|
||||
using detail::OrderAscending;
|
||||
using detail::OrderDescending;
|
||||
using detail::SharedTraits;
|
||||
|
||||
#if VQSORT_ENABLED || HWY_IDE
|
||||
using detail::OrderAscending128;
|
||||
using detail::OrderAscendingKV128;
|
||||
using detail::Traits128;
|
||||
|
||||
template <class Traits>
|
||||
HWY_NOINLINE void BenchPartition() {
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
const SortTag<LaneType> d;
|
||||
detail::SharedTraits<Traits> st;
|
||||
const Dist dist = Dist::kUniform8;
|
||||
double sum = 0.0;
|
||||
|
||||
detail::Generator rng(&sum, 123); // for ChoosePivot
|
||||
|
||||
const size_t max_log2 = AdjustedLog2Reps(20);
|
||||
for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
|
||||
const size_t num_lanes = 1ull << log2;
|
||||
const size_t num_keys = num_lanes / st.LanesPerKey();
|
||||
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
|
||||
auto buf = hwy::AllocateAligned<LaneType>(
|
||||
HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
|
||||
hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
|
||||
|
||||
std::vector<double> seconds;
|
||||
const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
|
||||
for (size_t rep = 0; rep < num_reps; ++rep) {
|
||||
(void)GenerateInput(dist, aligned.get(), num_lanes);
|
||||
|
||||
// The pivot value can influence performance. Do exactly what vqsort will
|
||||
// do so that the performance (influenced by prefetching and branch
|
||||
// prediction) is likely to predict the actual performance inside vqsort.
|
||||
detail::DrawSamples(d, st, aligned.get(), num_lanes, buf.get(), rng);
|
||||
detail::SortSamples(d, st, buf.get());
|
||||
auto pivot = detail::ChoosePivotByRank(d, st, buf.get());
|
||||
|
||||
const Timestamp t0;
|
||||
detail::Partition(d, st, aligned.get(), num_lanes - 1, pivot, buf.get());
|
||||
seconds.push_back(SecondsSince(t0));
|
||||
// 'Use' the result to prevent optimizing out the partition.
|
||||
sum += static_cast<double>(aligned.get()[num_lanes / 2]);
|
||||
}
|
||||
|
||||
Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
|
||||
sizeof(KeyType), st.KeyString())
|
||||
.Print();
|
||||
}
|
||||
HWY_ASSERT(sum != 999999); // Prevent optimizing out
|
||||
}
|
||||
|
||||
// Runs BenchPartition for a selection of key types and orders. Does nothing
// on SSSE3, whose benchmark results are not of interest.
HWY_NOINLINE void BenchAllPartition() {
  if (HWY_TARGET != HWY_SSSE3) {
    BenchPartition<TraitsLane<OrderDescending<float>>>();
    BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
    BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
    BenchPartition<Traits128<OrderAscending128>>();
    // BenchPartition<Traits128<OrderDescending128>>();
    BenchPartition<Traits128<OrderAscendingKV128>>();
  }
}
|
||||
|
||||
template <class Traits>
|
||||
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
|
||||
// Not interested in benchmark results for these targets
|
||||
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
|
||||
return;
|
||||
}
|
||||
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
const SortTag<LaneType> d;
|
||||
detail::SharedTraits<Traits> st;
|
||||
const Dist dist = Dist::kUniform32;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
const size_t num_lanes = SortConstants::BaseCaseNum(N);
|
||||
const size_t num_keys = num_lanes / st.LanesPerKey();
|
||||
auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
|
||||
auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
|
||||
|
||||
std::vector<double> seconds;
|
||||
double sum = 0; // prevents elision
|
||||
constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure
|
||||
|
||||
for (size_t rep = 0; rep < 30; ++rep) {
|
||||
InputStats<LaneType> input_stats =
|
||||
GenerateInput(dist, keys.get(), num_lanes);
|
||||
|
||||
const Timestamp t0;
|
||||
for (size_t i = 0; i < kMul; ++i) {
|
||||
detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
|
||||
buf.get());
|
||||
sum += static_cast<double>(keys[0]);
|
||||
}
|
||||
seconds.push_back(SecondsSince(t0));
|
||||
// printf("%f\n", seconds.back());
|
||||
|
||||
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
|
||||
}
|
||||
HWY_ASSERT(sum < 1E99);
|
||||
results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
|
||||
SummarizeMeasurements(seconds), sizeof(KeyType),
|
||||
st.KeyString());
|
||||
}
|
||||
|
||||
// Runs BenchBase for a selection of key types and orders, then prints all
// collected results. Does nothing on SSSE3.
HWY_NOINLINE void BenchAllBase() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET != HWY_SSSE3) {
    std::vector<Result> collected;
    BenchBase<TraitsLane<OrderAscending<float>>>(collected);
    BenchBase<TraitsLane<OrderDescending<int64_t>>>(collected);
    BenchBase<Traits128<OrderAscending128>>(collected);

    for (size_t idx = 0; idx < collected.size(); ++idx) {
      collected[idx].Print();
    }
  }
}
|
||||
|
||||
#else
|
||||
void BenchAllPartition() {}
|
||||
void BenchAllBase() {}
|
||||
#endif // VQSORT_ENABLED
|
||||
|
||||
std::vector<Algo> AlgoForBench() {
|
||||
return {
|
||||
#if HAVE_AVX2SORT
|
||||
Algo::kSEA,
|
||||
#endif
|
||||
#if HAVE_PARALLEL_IPS4O
|
||||
Algo::kParallelIPS4O,
|
||||
#elif HAVE_IPS4O
|
||||
Algo::kIPS4O,
|
||||
#endif
|
||||
#if HAVE_PDQSORT
|
||||
Algo::kPDQ,
|
||||
#endif
|
||||
#if HAVE_SORT512
|
||||
Algo::kSort512,
|
||||
#endif
|
||||
// Only include if we're compiling for the target it supports.
|
||||
#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
|
||||
(!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
|
||||
Algo::kVXSort,
|
||||
#endif
|
||||
|
||||
#if !HAVE_PARALLEL_IPS4O
|
||||
#if !SORT_100M
|
||||
// These are 10-20x slower, but that's OK for the default size when we
|
||||
// are not testing the parallel nor 100M modes.
|
||||
Algo::kStd, Algo::kHeap,
|
||||
#endif
|
||||
|
||||
Algo::kVQSort, // only ~4x slower, but not required for Table 1a
|
||||
#endif
|
||||
};
|
||||
}
|
||||
|
||||
template <class Traits>
|
||||
HWY_NOINLINE void BenchSort(size_t num_keys) {
|
||||
if (first_sort_target == 0) first_sort_target = HWY_TARGET;
|
||||
|
||||
SharedState shared;
|
||||
detail::SharedTraits<Traits> st;
|
||||
using Order = typename Traits::Order;
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
const size_t num_lanes = num_keys * st.LanesPerKey();
|
||||
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
|
||||
|
||||
const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;
|
||||
|
||||
for (Algo algo : AlgoForBench()) {
|
||||
// Other algorithms don't depend on the vector instructions, so only run
|
||||
// them for the first target.
|
||||
#if !HAVE_VXSORT
|
||||
if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (Dist dist : AllDist()) {
|
||||
std::vector<double> seconds;
|
||||
for (size_t rep = 0; rep < reps; ++rep) {
|
||||
InputStats<LaneType> input_stats =
|
||||
GenerateInput(dist, aligned.get(), num_lanes);
|
||||
|
||||
const Timestamp t0;
|
||||
Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
|
||||
shared, /*thread=*/0);
|
||||
seconds.push_back(SecondsSince(t0));
|
||||
// printf("%f\n", seconds.back());
|
||||
|
||||
HWY_ASSERT(
|
||||
VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
|
||||
}
|
||||
Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
|
||||
sizeof(KeyType), st.KeyString())
|
||||
.Print();
|
||||
} // dist
|
||||
} // algo
|
||||
}
|
||||
|
||||
// Runs BenchSort for a selection of key types and orders. The input size is
// 100M keys in the parallel-IPS4o or SORT_100M configurations and 1M keys
// otherwise. Does nothing on SSSE3 and SSE4.
HWY_NOINLINE void BenchAllSort() {
  // Not interested in benchmark results for these targets
  const bool skipped_target =
      (HWY_TARGET == HWY_SSSE3) || (HWY_TARGET == HWY_SSE4);
  if (skipped_target) {
    return;
  }

  constexpr size_t K = 1000;
  constexpr size_t M = K * K;
  (void)K;
  (void)M;
#if HAVE_PARALLEL_IPS4O || SORT_100M
  constexpr size_t kNumKeys = 100 * M;
#else
  constexpr size_t kNumKeys = 1 * M;
#endif

  // Kept as a list so that further sizes can be added easily.
  const size_t sizes[] = {kNumKeys};
  for (const size_t num_keys : sizes) {
    BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
    // BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
    // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
    BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
    BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
    // BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
    // BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
    // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);

#if !HAVE_VXSORT && VQSORT_ENABLED
    BenchSort<Traits128<OrderAscending128>>(num_keys);
    BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
#endif
  }
}
|
||||
|
||||
} // namespace
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
int64_t first_sort_target = 0; // none run yet
|
||||
namespace {
|
||||
HWY_BEFORE_TEST(BenchSort);
|
||||
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
|
||||
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
|
||||
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
|
||||
} // namespace
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,191 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
// Based on A.7 in "Entwurf und Implementierung vektorisierter
|
||||
// Sortieralgorithmen" and code by Mark Blacher.
|
||||
// Prints the source statements of the merge network for 16 vectors (v0..vf)
// with two keys each, followed by a blank line. Vector indices are printed
// in hexadecimal.
void PrintMergeNetwork16x2() {
  const auto swap_adjacent = [](int v) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", v, v);
  };
  const auto sort2 = [](int a, int b) {
    printf("st.Sort2(d, v%x, v%x);\n", a, b);
  };

  // Merge across all 16 vectors.
  for (int v = 8; v < 16; ++v) swap_adjacent(v);
  for (int v = 0; v < 8; ++v) sort2(v, 15 - v);

  // Merge within each group of 8 vectors.
  for (int v = 0; v < 4; ++v) {
    swap_adjacent(v + 4);
    swap_adjacent(v + 12);
  }
  for (int v = 0; v < 4; ++v) {
    sort2(v, 7 - v);
    sort2(v + 8, 15 - v);
  }

  // Merge within each group of 4 vectors.
  for (int base = 0; base < 16; base += 4) {
    swap_adjacent(base + 2);
    swap_adjacent(base + 3);
  }
  for (int base = 0; base < 16; base += 4) {
    sort2(base, base + 3);
    sort2(base + 1, base + 2);
  }

  // Merge within each pair of vectors.
  for (int base = 0; base < 16; base += 2) swap_adjacent(base + 1);
  for (int base = 0; base < 16; base += 2) sort2(base, base + 1);

  // Final step inside each vector.
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", v, v);
  }
  printf("\n");
}
|
||||
|
||||
// Prints a blank line followed by the source statements of the merge network
// for 16 vectors (v0..vf) with four keys each. Vector indices are printed in
// hexadecimal.
void PrintMergeNetwork16x4() {
  const auto reverse4 = [](int v) {
    printf("v%x = st.Reverse4(d, v%x);\n", v, v);
  };
  const auto sort2 = [](int a, int b) {
    printf("st.Sort2(d, v%x, v%x);\n", a, b);
  };

  printf("\n");

  // Merge across all 16 vectors.
  for (int v = 8; v < 16; ++v) reverse4(v);
  for (int v = 0; v < 8; ++v) sort2(v, 15 - v);

  // Merge within each group of 8 vectors.
  for (int v = 0; v < 4; ++v) {
    reverse4(v + 4);
    reverse4(v + 12);
  }
  for (int v = 0; v < 4; ++v) {
    sort2(v, 7 - v);
    sort2(v + 8, 15 - v);
  }

  // Merge within each group of 4 vectors.
  for (int base = 0; base < 16; base += 4) {
    reverse4(base + 2);
    reverse4(base + 3);
  }
  for (int base = 0; base < 16; base += 4) {
    sort2(base, base + 3);
    sort2(base + 1, base + 2);
  }

  // Merge within each pair of vectors.
  for (int base = 0; base < 16; base += 2) reverse4(base + 1);
  for (int base = 0; base < 16; base += 2) sort2(base, base + 1);

  // Final steps inside each vector.
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsReverse4(d, v%x);\n", v, v);
  }
  for (int v = 0; v < 16; ++v) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", v, v);
  }
}
|
||||
|
||||
// Prints the source statements of the 16-row merge network (rows v0..vf) for
// the 8-keys-per-row case. Each stage first prints ReverseKeys8 for the rows
// on the far side of a pairing and then Sort2 for every pair; the trailing
// loops print the per-row SortPairsReverse8 / SortPairsDistance{2,1} calls.
void PrintMergeNetwork16x8() {
  constexpr int kRows = 16;
  printf("\n");

  // Pair row r with row 15 - r.
  for (int row = kRows / 2; row < kRows; ++row) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", row, row);
  }
  for (int lo = 0, hi = kRows - 1; lo < hi; ++lo, --hi) {
    printf("st.Sort2(d, v%x, v%x);\n", lo, hi);
  }

  // Pair rows within each group of eight (v0..v7 and v8..vf).
  for (int k = 0; k < 4; ++k) {
    const int in_low_group = 4 + k;
    const int in_high_group = 12 + k;
    printf("v%x = st.ReverseKeys8(d, v%x);\n", in_low_group, in_low_group);
    printf("v%x = st.ReverseKeys8(d, v%x);\n", in_high_group, in_high_group);
  }
  for (int k = 0; k < 4; ++k) {
    printf("st.Sort2(d, v%x, v%x);\n", k, 7 - k);
    printf("st.Sort2(d, v%x, v%x);\n", 8 + k, 15 - k);
  }

  // Pair rows within each group of four.
  for (int base = 0; base < kRows; base += 4) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", base + 2, base + 2);
    printf("v%x = st.ReverseKeys8(d, v%x);\n", base + 3, base + 3);
  }
  for (int base = 0; base < kRows; base += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", base, base + 3);
    printf("st.Sort2(d, v%x, v%x);\n", base + 1, base + 2);
  }

  // Pair adjacent rows.
  for (int odd = 1; odd < kRows; odd += 2) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", odd, odd);
  }
  for (int even = 0; even < kRows; even += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", even, even + 1);
  }

  // Per-row steps.
  for (int row = 0; row < kRows; ++row) {
    printf("v%x = st.SortPairsReverse8(d, v%x);\n", row, row);
  }
  for (int row = 0; row < kRows; ++row) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", row, row);
  }
  for (int row = 0; row < kRows; ++row) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", row, row);
  }
}
|
||||
|
||||
// Prints the source statements of the 16-row merge network (rows v0..vf) for
// the 16-keys-per-row case. Each stage first prints ReverseKeys16 for the rows
// on the far side of a pairing and then Sort2 for every pair; the trailing
// loops print the per-row SortPairsReverse16 / SortPairsDistance{4,2,1} calls.
void PrintMergeNetwork16x16() {
  constexpr int kRows = 16;
  printf("\n");

  // Pair row r with row 15 - r.
  for (int row = kRows / 2; row < kRows; ++row) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", row, row);
  }
  for (int lo = 0, hi = kRows - 1; lo < hi; ++lo, --hi) {
    printf("st.Sort2(d, v%x, v%x);\n", lo, hi);
  }

  // Pair rows within each group of eight (v0..v7 and v8..vf).
  for (int k = 0; k < 4; ++k) {
    const int in_low_group = 4 + k;
    const int in_high_group = 12 + k;
    printf("v%x = st.ReverseKeys16(d, v%x);\n", in_low_group, in_low_group);
    printf("v%x = st.ReverseKeys16(d, v%x);\n", in_high_group, in_high_group);
  }
  for (int k = 0; k < 4; ++k) {
    printf("st.Sort2(d, v%x, v%x);\n", k, 7 - k);
    printf("st.Sort2(d, v%x, v%x);\n", 8 + k, 15 - k);
  }

  // Pair rows within each group of four.
  for (int base = 0; base < kRows; base += 4) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", base + 2, base + 2);
    printf("v%x = st.ReverseKeys16(d, v%x);\n", base + 3, base + 3);
  }
  for (int base = 0; base < kRows; base += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", base, base + 3);
    printf("st.Sort2(d, v%x, v%x);\n", base + 1, base + 2);
  }

  // Pair adjacent rows.
  for (int odd = 1; odd < kRows; odd += 2) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", odd, odd);
  }
  for (int even = 0; even < kRows; even += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", even, even + 1);
  }

  // Per-row steps.
  for (int row = 0; row < kRows; ++row) {
    printf("v%x = st.SortPairsReverse16<kOrder>(d, v%x);\n", row, row);
  }
  for (int row = 0; row < kRows; ++row) {
    printf("v%x = st.SortPairsDistance4<kOrder>(d, v%x);\n", row, row);
  }
  for (int row = 0; row < kRows; ++row) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", row, row);
  }
  for (int row = 0; row < kRows; ++row) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", row, row);
  }
}
|
||||
|
||||
// Entry point: prints the four 16-row merge networks to stdout, in increasing
// order of keys per row. Command-line arguments are ignored.
int main(int argc, char** argv) {
  (void)argc;
  (void)argv;

  PrintMergeNetwork16x2();
  PrintMergeNetwork16x4();
  PrintMergeNetwork16x8();
  PrintMergeNetwork16x16();

  return 0;
}
|
||||
@@ -0,0 +1,139 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
|
||||
// Normal include guard for non-SIMD parts
|
||||
#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
|
||||
|
||||
#include <time.h>
|
||||
|
||||
#include <algorithm> // std::sort
|
||||
#include <string>
|
||||
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/nanobenchmark.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
struct Timestamp {
|
||||
Timestamp() { t = platform::Now(); }
|
||||
double t;
|
||||
};
|
||||
|
||||
static inline double SecondsSince(const Timestamp& t0) {
|
||||
const Timestamp t1;
|
||||
return t1.t - t0.t;
|
||||
}
|
||||
|
||||
// Returns a trimmed mean of `seconds`: the vector is sorted ascending IN PLACE
// (the caller's vector is reordered) and the samples at indices
// [num / 4, num / 2) are averaged. We don't want to run an out-of-L3-cache
// sort often enough for the mode to be reliable.
//
// With fewer than two samples that index window is empty; instead of dividing
// by zero (which yields NaN), a single sample is returned unchanged and an
// empty input yields 0.0.
static inline double SummarizeMeasurements(std::vector<double>& seconds) {
  if (seconds.empty()) return 0.0;
  std::sort(seconds.begin(), seconds.end());

  const size_t num = seconds.size();
  const size_t begin = num / 4;
  const size_t end = num / 2;
  // Only reachable for num == 1: nothing to trim, report the sole sample.
  if (begin == end) return seconds.front();

  double sum = 0.0;
  for (size_t i = begin; i < end; ++i) {
    sum += seconds[i];
  }
  return sum / static_cast<double>(end - begin);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
|
||||
|
||||
// Per-target
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
|
||||
#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct Result {
|
||||
Result() {}
|
||||
Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
|
||||
double sec, size_t sizeof_key, const std::string& key_name)
|
||||
: target(HWY_TARGET),
|
||||
algo(algo),
|
||||
dist(dist),
|
||||
num_keys(num_keys),
|
||||
num_threads(num_threads),
|
||||
sec(sec),
|
||||
sizeof_key(sizeof_key),
|
||||
key_name(key_name) {}
|
||||
|
||||
void Print() const {
|
||||
const double bytes = static_cast<double>(num_keys) *
|
||||
static_cast<double>(num_threads) *
|
||||
static_cast<double>(sizeof_key);
|
||||
printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
|
||||
hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
|
||||
DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
|
||||
num_threads);
|
||||
}
|
||||
|
||||
int64_t target;
|
||||
Algo algo;
|
||||
Dist dist;
|
||||
size_t num_keys = 0;
|
||||
size_t num_threads = 0;
|
||||
double sec = 0.0;
|
||||
size_t sizeof_key = 0;
|
||||
std::string key_name;
|
||||
};
|
||||
|
||||
template <class Traits, typename LaneType>
|
||||
bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
|
||||
const LaneType* out, size_t num_lanes, const char* caller) {
|
||||
constexpr size_t N1 = st.LanesPerKey();
|
||||
HWY_ASSERT(num_lanes >= N1);
|
||||
|
||||
InputStats<LaneType> output_stats;
|
||||
// Ensure it matches the sort order
|
||||
for (size_t i = 0; i < num_lanes - N1; i += N1) {
|
||||
output_stats.Notify(out[i]);
|
||||
if (N1 == 2) output_stats.Notify(out[i + 1]);
|
||||
// Reverse order instead of checking !Compare1 so we accept equal keys.
|
||||
if (st.Compare1(out + i + N1, out + i)) {
|
||||
printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
|
||||
caller, static_cast<int>(i), static_cast<int>(num_lanes),
|
||||
static_cast<int>(N1), static_cast<double>(out[i + 1]),
|
||||
static_cast<double>(out[i + 0]),
|
||||
static_cast<double>(out[i + N1 + 1]),
|
||||
static_cast<double>(out[i + N1]));
|
||||
HWY_ABORT("%d-bit sort is incorrect\n",
|
||||
static_cast<int>(sizeof(LaneType) * 8 * N1));
|
||||
}
|
||||
}
|
||||
output_stats.Notify(out[num_lanes - N1]);
|
||||
if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
|
||||
|
||||
return input_stats == output_stats;
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
|
||||
@@ -0,0 +1,133 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Definitions shared between vqsort-inl and sorting_networks-inl.
|
||||
|
||||
// Normal include guard for target-independent parts
|
||||
#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Internal constants - these are to avoid magic numbers/literals and cannot be
|
||||
// changed without also changing the associated code.
|
||||
struct SortConstants {
|
||||
// SortingNetwork reshapes its input into a matrix. This is the maximum number
|
||||
// of *keys* per vector.
|
||||
#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
|
||||
static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow
|
||||
#else
|
||||
static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
|
||||
#endif
|
||||
|
||||
// 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
|
||||
// fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
|
||||
// code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
|
||||
// extra logN factor for larger networks (for which only loose upper bounds
|
||||
// on size are known).
|
||||
static constexpr size_t kMaxRowsLog2 = 4;
|
||||
static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
|
||||
|
||||
static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
|
||||
return kMaxRows * HWY_MIN(N, kMaxCols);
|
||||
}
|
||||
|
||||
// Unrolling is important (pipelining and amortizing branch mispredictions);
|
||||
// 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
|
||||
// somewhat slower for sorting than 4x.
|
||||
//
|
||||
// To change, must also update left + 3 * N etc. in the loop.
|
||||
static constexpr size_t kPartitionUnroll = 4;
|
||||
|
||||
static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
|
||||
// The main loop reads kPartitionUnroll vectors, and first loads from
|
||||
// both left and right beforehand, so it requires min = 2 *
|
||||
// kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
|
||||
// >= BaseCaseNum), we partition the right side into a buffer. We need
|
||||
// another vector at the end so CompressStore does not overwrite anything.
|
||||
return (2 * kPartitionUnroll + 1) * N;
|
||||
}
|
||||
|
||||
// Chunk := group of keys loaded for sampling a pivot. Matches the typical
|
||||
// cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
|
||||
// are larger, use entire vectors to ensure we do not overrun the array.
|
||||
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
|
||||
return HWY_MAX(64 / sizeof_t, N);
|
||||
}
|
||||
|
||||
static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
|
||||
// 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
|
||||
return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static constexpr HWY_INLINE size_t BufNum(size_t N) {
|
||||
// One extra for padding plus another for full-vector loads.
|
||||
return HWY_MAX(BaseCaseNum(N) + 2 * N,
|
||||
HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
|
||||
return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
|
||||
|
||||
// Per-target
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
|
||||
#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
|
||||
#endif
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
// VQSORT_ENABLED is 1 unless vqsort is unavailable (HWY_SCALAR target) or the
// build would time out (MSVC opt and Arm v7 debug), in which case it is 0.
#undef VQSORT_ENABLED
#if (HWY_TARGET != HWY_SCALAR) &&                   \
    !(HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) && \
    !(HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
#define VQSORT_ENABLED 1
#else
#define VQSORT_ENABLED 0
#endif
|
||||
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Default tag / vector width selector.
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
|
||||
template <typename T>
|
||||
using SortTag = ScalableTag<T, -1>;
|
||||
#else
|
||||
template <typename T>
|
||||
using SortTag = ScalableTag<T>;
|
||||
#endif
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
|
||||
@@ -0,0 +1,626 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __STDC_FORMAT_MACROS
|
||||
#define __STDC_FORMAT_MACROS // before inttypes.h
|
||||
#endif
|
||||
#include <inttypes.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h> // memcpy
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/contrib/sort/result-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace {
|
||||
|
||||
using detail::OrderAscending;
|
||||
using detail::OrderDescending;
|
||||
using detail::SharedTraits;
|
||||
using detail::TraitsLane;
|
||||
#if VQSORT_ENABLED || HWY_IDE
|
||||
using detail::OrderAscending128;
|
||||
using detail::OrderAscendingKV128;
|
||||
using detail::OrderAscendingKV64;
|
||||
using detail::OrderDescending128;
|
||||
using detail::OrderDescendingKV128;
|
||||
using detail::OrderDescendingKV64;
|
||||
using detail::Traits128;
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void TestMedian3() {
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using D = CappedTag<LaneType, 1>;
|
||||
SharedTraits<Traits> st;
|
||||
const D d;
|
||||
using V = Vec<D>;
|
||||
for (uint32_t bits = 0; bits < 8; ++bits) {
|
||||
const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u});
|
||||
const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u});
|
||||
const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u});
|
||||
const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2));
|
||||
// If at least half(rounded up) of bits are 1, so is the median.
|
||||
const size_t count = PopCount(bits);
|
||||
HWY_ASSERT_EQ((count >= 2) ? static_cast<LaneType>(1) : 0, m);
|
||||
}
|
||||
}
|
||||
|
||||
// Runs TestMedian3 for ascending uint64_t lanes.
HWY_NOINLINE void TestAllMedian() {
  using AscendingU64 = TraitsLane<OrderAscending<uint64_t> >;
  TestMedian3<AscendingU64>();
}
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void TestBaseCaseAscDesc() {
|
||||
using LaneType = typename Traits::LaneType;
|
||||
SharedTraits<Traits> st;
|
||||
const SortTag<LaneType> d;
|
||||
const size_t N = Lanes(d);
|
||||
const size_t base_case_num = SortConstants::BaseCaseNum(N);
|
||||
const size_t N1 = st.LanesPerKey();
|
||||
|
||||
constexpr int kDebug = 0;
|
||||
auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
|
||||
auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
|
||||
|
||||
std::vector<size_t> lengths;
|
||||
lengths.push_back(HWY_MAX(1, N1));
|
||||
lengths.push_back(3 * N1);
|
||||
lengths.push_back(base_case_num / 2);
|
||||
lengths.push_back(base_case_num / 2 + N1);
|
||||
lengths.push_back(base_case_num - N1);
|
||||
lengths.push_back(base_case_num);
|
||||
|
||||
std::vector<size_t> misalignments;
|
||||
misalignments.push_back(0);
|
||||
misalignments.push_back(1);
|
||||
if (N >= 6) misalignments.push_back(N / 2 - 1);
|
||||
misalignments.push_back(N / 2);
|
||||
misalignments.push_back(N / 2 + 1);
|
||||
misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));
|
||||
|
||||
for (bool asc : {false, true}) {
|
||||
for (size_t len : lengths) {
|
||||
for (size_t misalign : misalignments) {
|
||||
LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
|
||||
if (kDebug) {
|
||||
printf("============%s asc %d N1 %d len %d misalign %d\n",
|
||||
st.KeyString().c_str(), asc, static_cast<int>(N1),
|
||||
static_cast<int>(len), static_cast<int>(misalign));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
aligned_lanes[i] = hwy::LowestValue<LaneType>();
|
||||
}
|
||||
InputStats<LaneType> input_stats;
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
|
||||
: static_cast<LaneType>(LaneType(len) - LaneType(i));
|
||||
input_stats.Notify(lanes[i]);
|
||||
if (kDebug >= 2) {
|
||||
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
|
||||
}
|
||||
}
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
lanes[i] = hwy::LowestValue<LaneType>();
|
||||
}
|
||||
|
||||
detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());
|
||||
|
||||
if (kDebug >= 2) {
|
||||
printf("out>>>>>>\n");
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
|
||||
}
|
||||
}
|
||||
|
||||
HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
|
||||
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
|
||||
}
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
if (lanes[i] != hwy::LowestValue<LaneType>())
|
||||
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
|
||||
}
|
||||
} // misalign
|
||||
} // len
|
||||
} // asc
|
||||
}
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void TestBaseCase01() {
|
||||
using LaneType = typename Traits::LaneType;
|
||||
SharedTraits<Traits> st;
|
||||
const SortTag<LaneType> d;
|
||||
const size_t N = Lanes(d);
|
||||
const size_t base_case_num = SortConstants::BaseCaseNum(N);
|
||||
const size_t N1 = st.LanesPerKey();
|
||||
|
||||
constexpr int kDebug = 0;
|
||||
auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
|
||||
auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
|
||||
|
||||
std::vector<size_t> lengths;
|
||||
lengths.push_back(HWY_MAX(1, N1));
|
||||
lengths.push_back(3 * N1);
|
||||
lengths.push_back(base_case_num / 2);
|
||||
lengths.push_back(base_case_num / 2 + N1);
|
||||
lengths.push_back(base_case_num - N1);
|
||||
lengths.push_back(base_case_num);
|
||||
|
||||
for (size_t len : lengths) {
|
||||
if (kDebug) {
|
||||
printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
|
||||
static_cast<int>(N1), static_cast<int>(len));
|
||||
}
|
||||
const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
|
||||
for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
|
||||
InputStats<LaneType> input_stats;
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
|
||||
input_stats.Notify(lanes[i]);
|
||||
if (kDebug >= 2) {
|
||||
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
|
||||
}
|
||||
}
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
lanes[i] = hwy::LowestValue<LaneType>();
|
||||
}
|
||||
|
||||
detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());
|
||||
|
||||
if (kDebug >= 2) {
|
||||
printf("out>>>>>>\n");
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
|
||||
}
|
||||
}
|
||||
|
||||
HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
if (lanes[i] != hwy::LowestValue<LaneType>())
|
||||
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
|
||||
}
|
||||
} // bits
|
||||
} // len
|
||||
}
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void TestBaseCase() {
|
||||
TestBaseCaseAscDesc<Traits>();
|
||||
TestBaseCase01<Traits>();
|
||||
}
|
||||
|
||||
// Runs TestBaseCase for a selection of lane-based and 128-bit key traits.
HWY_NOINLINE void TestAllBaseCase() {
#ifdef _MSC_VER
  // Workaround for stack overflow on MSVC debug: skip these tests entirely.
  return;
#endif
  TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
  TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
  TestBaseCase<Traits128<OrderAscending128> >();
  TestBaseCase<Traits128<OrderDescending128> >();
}
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void VerifyPartition(
|
||||
Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
|
||||
size_t border, size_t right, const size_t N1,
|
||||
const typename Traits::LaneType* pivot) {
|
||||
/* for (size_t i = left; i < right; ++i) {
|
||||
if (i == border) printf("--\n");
|
||||
printf("%4zu: %3d\n", i, lanes[i]);
|
||||
}*/
|
||||
|
||||
HWY_ASSERT(left % N1 == 0);
|
||||
HWY_ASSERT(border % N1 == 0);
|
||||
HWY_ASSERT(right % N1 == 0);
|
||||
const bool asc = typename Traits::Order().IsAscending();
|
||||
for (size_t i = left; i < border; i += N1) {
|
||||
if (st.Compare1(pivot, lanes + i)) {
|
||||
HWY_ABORT(
|
||||
"%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
|
||||
"border %d",
|
||||
st.KeyString().c_str(), asc, static_cast<int>(i),
|
||||
static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
|
||||
static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
|
||||
static_cast<int>(border));
|
||||
}
|
||||
}
|
||||
for (size_t i = border; i < right; i += N1) {
|
||||
if (!st.Compare1(pivot, lanes + i)) {
|
||||
HWY_ABORT(
|
||||
"%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
|
||||
"border %d",
|
||||
st.KeyString().c_str(), asc, static_cast<int>(i),
|
||||
static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
|
||||
static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
|
||||
static_cast<int>(border));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void TestPartition() {
|
||||
using LaneType = typename Traits::LaneType;
|
||||
const SortTag<LaneType> d;
|
||||
SharedTraits<Traits> st;
|
||||
const bool asc = typename Traits::Order().IsAscending();
|
||||
const size_t N = Lanes(d);
|
||||
constexpr int kDebug = 0;
|
||||
const size_t base_case_num = SortConstants::BaseCaseNum(N);
|
||||
// left + len + align
|
||||
const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
|
||||
auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
|
||||
auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));
|
||||
|
||||
const size_t N1 = st.LanesPerKey();
|
||||
for (bool in_asc : {false, true}) {
|
||||
for (int left_i : {0, 1, 4, 6, 7, 8, 12, 15, 22, 28, 30, 31}) {
|
||||
const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
|
||||
for (size_t ofs : {N, N + 1, N + 3, 2 * N, 2 * N + 2, 2 * N + 3,
|
||||
3 * N - 1, 4 * N - 3, 4 * N - 2}) {
|
||||
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
|
||||
for (LaneType pivot1 :
|
||||
{LaneType(0), LaneType(len / 3), LaneType(len / 2),
|
||||
LaneType(2 * len / 3), LaneType(len)}) {
|
||||
const LaneType pivot2[2] = {pivot1, 0};
|
||||
const auto pivot = st.SetKey(d, pivot2);
|
||||
for (size_t misalign = 0; misalign < N;
|
||||
misalign += st.LanesPerKey()) {
|
||||
LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
|
||||
const size_t right = left + len;
|
||||
if (kDebug) {
|
||||
printf(
|
||||
"=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
|
||||
st.KeyString().c_str(), asc, static_cast<int>(left),
|
||||
static_cast<int>(len), static_cast<int>(right),
|
||||
static_cast<double>(pivot2[1]),
|
||||
static_cast<double>(pivot2[0]));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
aligned_lanes[i] = hwy::LowestValue<LaneType>();
|
||||
}
|
||||
for (size_t i = 0; i < left; ++i) {
|
||||
lanes[i] = hwy::LowestValue<LaneType>();
|
||||
}
|
||||
std::unordered_map<LaneType, int> counts;
|
||||
for (size_t i = left; i < right; ++i) {
|
||||
lanes[i] = static_cast<LaneType>(
|
||||
in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
|
||||
: static_cast<LaneType>(right) - LaneType(i));
|
||||
++counts[lanes[i]];
|
||||
if (kDebug >= 2) {
|
||||
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
|
||||
}
|
||||
}
|
||||
for (size_t i = right; i < total - misalign; ++i) {
|
||||
lanes[i] = hwy::LowestValue<LaneType>();
|
||||
}
|
||||
|
||||
size_t border =
|
||||
left + detail::Partition(d, st, lanes + left, right - left,
|
||||
pivot, buf.get());
|
||||
|
||||
if (kDebug >= 2) {
|
||||
printf("out>>>>>>\n");
|
||||
for (size_t i = left; i < right; ++i) {
|
||||
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
|
||||
}
|
||||
for (size_t i = right; i < total - misalign; ++i) {
|
||||
printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
|
||||
}
|
||||
}
|
||||
for (size_t i = left; i < right; ++i) {
|
||||
--counts[lanes[i]];
|
||||
}
|
||||
for (auto kv : counts) {
|
||||
if (kv.second != 0) {
|
||||
PrintValue(kv.first);
|
||||
HWY_ABORT("Incorrect count %d\n", kv.second);
|
||||
}
|
||||
}
|
||||
VerifyPartition(st, lanes, left, border, right, N1, pivot2);
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
|
||||
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
|
||||
}
|
||||
for (size_t i = 0; i < left; ++i) {
|
||||
if (lanes[i] != hwy::LowestValue<LaneType>())
|
||||
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
|
||||
}
|
||||
for (size_t i = right; i < total - misalign; ++i) {
|
||||
if (lanes[i] != hwy::LowestValue<LaneType>())
|
||||
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
|
||||
}
|
||||
} // misalign
|
||||
} // pivot
|
||||
} // len
|
||||
} // left
|
||||
} // asc
|
||||
}
|
||||
|
||||
// Runs TestPartition for two trait types in every build, and for the
// remaining ones only when HWY_IS_DEBUG_BUILD is not set.
HWY_NOINLINE void TestAllPartition() {
  // Always tested.
  TestPartition<TraitsLane<OrderDescending<int32_t> > >();
  TestPartition<Traits128<OrderAscending128> >();

#if !HWY_IS_DEBUG_BUILD
  // Only tested in non-debug builds.
  TestPartition<TraitsLane<OrderAscending<int16_t> > >();
  TestPartition<TraitsLane<OrderAscending<int64_t> > >();
  TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
  TestPartition<TraitsLane<OrderDescending<double> > >();
#endif
  TestPartition<Traits128<OrderDescending128> >();
#endif  // !HWY_IS_DEBUG_BUILD
}
|
||||
|
||||
// (used for sample selection for choosing a pivot)
|
||||
template <typename TU>
|
||||
static HWY_NOINLINE void TestRandomGenerator() {
|
||||
static_assert(!hwy::IsSigned<TU>(), "");
|
||||
SortTag<TU> du;
|
||||
const size_t N = Lanes(du);
|
||||
|
||||
detail::Generator rng(&N, N);
|
||||
|
||||
const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N); // power of two
|
||||
|
||||
for (uint32_t num_blocks = 2; num_blocks < 100000;
|
||||
num_blocks = 3 * num_blocks / 2) {
|
||||
// Generate some numbers and ensure all are in range
|
||||
uint64_t sum = 0;
|
||||
constexpr size_t kReps = 10000;
|
||||
for (size_t rep = 0; rep < kReps; ++rep) {
|
||||
const uint32_t bits = rng() & 0xFFFFFFFF;
|
||||
const size_t index = detail::RandomChunkIndex(num_blocks, bits);
|
||||
HWY_ASSERT(((index + 1) * lanes_per_block) <=
|
||||
num_blocks * lanes_per_block);
|
||||
|
||||
sum += index;
|
||||
}
|
||||
|
||||
// Also ensure the mean is near the middle of the range
|
||||
const double expected = (num_blocks - 1) / 2.0;
|
||||
const double actual = static_cast<double>(sum) / kReps;
|
||||
HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
|
||||
}
|
||||
}
|
||||
|
||||
// Runs TestRandomGenerator for both unsigned lane types it is tested with.
HWY_NOINLINE void TestAllGenerator() {
  TestRandomGenerator<uint32_t>();  // 32-bit lanes
  TestRandomGenerator<uint64_t>();  // 64-bit lanes
}
|
||||
|
||||
#else
|
||||
// No-op stand-ins for the tests defined in the `#if VQSORT_ENABLED || HWY_IDE`
// branch above; this `#else` branch keeps the same function names defined
// when that condition is false.
static void TestAllMedian() {}
|
||||
static void TestAllBaseCase() {}
|
||||
static void TestAllPartition() {}
|
||||
static void TestAllGenerator() {}
|
||||
#endif // VQSORT_ENABLED
|
||||
|
||||
// Remembers input, and compares results to that of a reference algorithm.
|
||||
template <class Traits>
|
||||
class CompareResults {
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
|
||||
public:
|
||||
CompareResults(const LaneType* in, size_t num_lanes) {
|
||||
copy_.resize(num_lanes);
|
||||
memcpy(copy_.data(), in, num_lanes * sizeof(LaneType));
|
||||
}
|
||||
|
||||
bool Verify(const LaneType* output) {
|
||||
#if HAVE_PDQSORT
|
||||
const Algo reference = Algo::kPDQ;
|
||||
#else
|
||||
const Algo reference = Algo::kStd;
|
||||
#endif
|
||||
SharedState shared;
|
||||
using Order = typename Traits::Order;
|
||||
const Traits st;
|
||||
const size_t num_keys = copy_.size() / st.LanesPerKey();
|
||||
Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
|
||||
shared, /*thread=*/0);
|
||||
#if VQSORT_PRINT >= 3
|
||||
fprintf(stderr, "\nExpected:\n");
|
||||
for (size_t i = 0; i < copy_.size(); ++i) {
|
||||
PrintValue(copy_[i]);
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
for (size_t i = 0; i < copy_.size(); ++i) {
|
||||
if (copy_[i] != output[i]) {
|
||||
if (sizeof(KeyType) == 16) {
|
||||
fprintf(stderr,
|
||||
"%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
|
||||
st.KeyString().c_str(), Order().IsAscending(),
|
||||
static_cast<int>(i), static_cast<int>(copy_.size()),
|
||||
static_cast<uint64_t>(copy_[i]),
|
||||
static_cast<uint64_t>(output[i]));
|
||||
} else {
|
||||
fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
|
||||
st.KeyString().c_str(), Order().IsAscending(),
|
||||
static_cast<int>(i), static_cast<int>(copy_.size()));
|
||||
PrintValue(copy_[i]);
|
||||
PrintValue(output[i]);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<LaneType> copy_;
|
||||
};
|
||||
|
||||
std::vector<Algo> AlgoForTest() {
|
||||
return {
|
||||
#if HAVE_AVX2SORT
|
||||
Algo::kSEA,
|
||||
#endif
|
||||
#if HAVE_IPS4O
|
||||
Algo::kIPS4O,
|
||||
#endif
|
||||
#if HAVE_PDQSORT
|
||||
Algo::kPDQ,
|
||||
#endif
|
||||
#if HAVE_SORT512
|
||||
Algo::kSort512,
|
||||
#endif
|
||||
Algo::kHeap, Algo::kVQSort,
|
||||
};
|
||||
}
|
||||
|
||||
template <class Traits>
|
||||
void TestSort(size_t num_lanes) {
|
||||
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
|
||||
#if defined(_MSC_VER)
|
||||
return;
|
||||
#endif
|
||||
using Order = typename Traits::Order;
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
SharedState shared;
|
||||
SharedTraits<Traits> st;
|
||||
|
||||
// Round up to a whole number of keys.
|
||||
num_lanes += (st.Is128() && (num_lanes & 1));
|
||||
const size_t num_keys = num_lanes / st.LanesPerKey();
|
||||
|
||||
constexpr size_t kMaxMisalign = 16;
|
||||
auto aligned =
|
||||
hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
|
||||
for (Algo algo : AlgoForTest()) {
|
||||
for (Dist dist : AllDist()) {
|
||||
for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
|
||||
size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
|
||||
LaneType* lanes = aligned.get() + misalign;
|
||||
|
||||
// Set up red zones before/after the keys to sort
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
aligned[i] = hwy::LowestValue<LaneType>();
|
||||
}
|
||||
for (size_t i = 0; i < kMaxMisalign; ++i) {
|
||||
lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
|
||||
}
|
||||
#if HWY_IS_MSAN
|
||||
__msan_poison(aligned.get(), misalign * sizeof(LaneType));
|
||||
__msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
|
||||
#endif
|
||||
InputStats<LaneType> input_stats =
|
||||
GenerateInput(dist, lanes, num_lanes);
|
||||
|
||||
CompareResults<Traits> compare(lanes, num_lanes);
|
||||
Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
|
||||
/*thread=*/0);
|
||||
HWY_ASSERT(compare.Verify(lanes));
|
||||
HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
|
||||
|
||||
// Check red zones
|
||||
#if HWY_IS_MSAN
|
||||
__msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
|
||||
__msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
|
||||
#endif
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
if (aligned[i] != hwy::LowestValue<LaneType>())
|
||||
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
|
||||
}
|
||||
for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
|
||||
if (lanes[i] != hwy::HighestValue<LaneType>())
|
||||
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
|
||||
}
|
||||
} // misalign
|
||||
} // dist
|
||||
} // algo
|
||||
}
|
||||
|
||||
void TestAllSort() {
|
||||
for (int num : {129, 504, 3 * 1000, 34567}) {
|
||||
const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
|
||||
TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
|
||||
TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
|
||||
|
||||
TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
|
||||
TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);
|
||||
|
||||
TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
|
||||
TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
|
||||
|
||||
// WARNING: for float types, SIMD comparisons will flush denormals to
|
||||
// zero, causing mismatches with scalar sorts. In this test, we avoid
|
||||
// generating denormal inputs.
|
||||
TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
|
||||
#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom
|
||||
if (Sorter::HaveFloat64()) {
|
||||
TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Our HeapSort does not support 128-bit keys.
|
||||
#if VQSORT_ENABLED
|
||||
TestSort<Traits128<OrderAscending128> >(num_lanes);
|
||||
TestSort<Traits128<OrderDescending128> >(num_lanes);
|
||||
|
||||
TestSort<TraitsLane<OrderAscendingKV64> >(num_lanes);
|
||||
TestSort<TraitsLane<OrderDescendingKV64> >(num_lanes);
|
||||
|
||||
TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
|
||||
TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_BEFORE_TEST(SortTest);
|
||||
HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
|
||||
HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
|
||||
HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
|
||||
HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
|
||||
HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
|
||||
} // namespace
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,695 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
|
||||
#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
|
||||
#endif
|
||||
|
||||
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace detail {
|
||||
|
||||
#if VQSORT_ENABLED
|
||||
|
||||
using Constants = hwy::SortConstants;
|
||||
|
||||
// ------------------------------ SharedTraits
|
||||
|
||||
// Code shared between all traits. It's unclear whether these can profitably be
|
||||
// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
|
||||
// Compare/DupOdd.
|
||||
template <class Base>
|
||||
struct SharedTraits : public Base {
|
||||
// Conditionally swaps lane 0 with 2, 1 with 3 etc.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->SwapAdjacentPairs(d, v);
|
||||
base->Sort2(d, v, swapped);
|
||||
return base->OddEvenPairs(d, swapped, v);
|
||||
}
|
||||
|
||||
// Swaps with the vector formed by reversing contiguous groups of 8 keys.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->ReverseKeys8(d, v);
|
||||
base->Sort2(d, v, swapped);
|
||||
return base->OddEvenQuads(d, swapped, v);
|
||||
}
|
||||
|
||||
// Swaps with the vector formed by reversing contiguous groups of 8 keys.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
|
||||
Vec<D> swapped = base->ReverseKeys(d, v);
|
||||
base->Sort2(d, v, swapped);
|
||||
return ConcatUpperLower(d, swapped, v); // 8 = half of the vector
|
||||
}
|
||||
};
|
||||
|
||||
// ------------------------------ Sorting network
|
||||
|
||||
// (Green's irregular) sorting network for independent columns in 16 vectors.
//
// `d` is the vector tag and `st` the traits object providing Sort2.
// v0..vf are the 16 vectors, updated in place. The body is a fixed sequence of
// 60 st.Sort2 calls; the call order is part of the network and must not be
// changed.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  // Regular part: pairs at index distance 1, 2, 4, then 8.
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  st.Sort2(d, v0, v2);
  st.Sort2(d, v1, v3);
  st.Sort2(d, v4, v6);
  st.Sort2(d, v5, v7);
  st.Sort2(d, v8, va);
  st.Sort2(d, v9, vb);
  st.Sort2(d, vc, ve);
  st.Sort2(d, vd, vf);
  st.Sort2(d, v0, v4);
  st.Sort2(d, v1, v5);
  st.Sort2(d, v2, v6);
  st.Sort2(d, v3, v7);
  st.Sort2(d, v8, vc);
  st.Sort2(d, v9, vd);
  st.Sort2(d, va, ve);
  st.Sort2(d, vb, vf);
  st.Sort2(d, v0, v8);
  st.Sort2(d, v1, v9);
  st.Sort2(d, v2, va);
  st.Sort2(d, v3, vb);
  st.Sort2(d, v4, vc);
  st.Sort2(d, v5, vd);
  st.Sort2(d, v6, ve);
  st.Sort2(d, v7, vf);
  // Irregular part: the remaining 28 calls.
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v7, vb);
  st.Sort2(d, vd, ve);
  st.Sort2(d, v4, v8);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v1, v4);
  st.Sort2(d, v7, vd);
  st.Sort2(d, v2, v8);
  st.Sort2(d, vb, ve);
  st.Sort2(d, v2, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vd);
  st.Sort2(d, v3, v8);
  st.Sort2(d, v7, vc);
  st.Sort2(d, v3, v5);
  st.Sort2(d, v6, v8);
  st.Sort2(d, v7, v9);
  st.Sort2(d, va, vc);
  st.Sort2(d, v3, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v7, v8);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vc);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
}
|
||||
|
||||
// ------------------------------ Merging networks
|
||||
|
||||
// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
|
||||
|
||||
// Merge network over the 16 vectors v0..vf (updated in place), using
// st.ReverseKeys2, st.Sort2 and st.SortPairsDistance1. `d` is the vector tag.
// SortingNetwork calls this after Sort16 when there are at least 2 keys per
// row. Statement order is part of the network and must not be changed.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  // Stage 1: reverse v8..vf, then pair vector i with vector 15-i.
  v8 = st.ReverseKeys2(d, v8);
  v9 = st.ReverseKeys2(d, v9);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  vc = st.ReverseKeys2(d, vc);
  vd = st.ReverseKeys2(d, vd);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Stage 2: reverse the upper half of each group of 8 vectors, then pair
  // mirrored vectors within each group.
  v4 = st.ReverseKeys2(d, v4);
  vc = st.ReverseKeys2(d, vc);
  v5 = st.ReverseKeys2(d, v5);
  vd = st.ReverseKeys2(d, vd);
  v6 = st.ReverseKeys2(d, v6);
  ve = st.ReverseKeys2(d, ve);
  v7 = st.ReverseKeys2(d, v7);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Stage 3: same pattern within each group of 4 vectors.
  v2 = st.ReverseKeys2(d, v2);
  v3 = st.ReverseKeys2(d, v3);
  v6 = st.ReverseKeys2(d, v6);
  v7 = st.ReverseKeys2(d, v7);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Stage 4: reverse every odd-numbered vector, then pair adjacent vectors.
  v1 = st.ReverseKeys2(d, v1);
  v3 = st.ReverseKeys2(d, v3);
  v5 = st.ReverseKeys2(d, v5);
  v7 = st.ReverseKeys2(d, v7);
  v9 = st.ReverseKeys2(d, v9);
  vb = st.ReverseKeys2(d, vb);
  vd = st.ReverseKeys2(d, vd);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Final step: SortPairsDistance1 on each vector individually.
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}
|
||||
|
||||
// Merge network over the 16 vectors v0..vf (updated in place), using
// st.ReverseKeys4, st.Sort2, st.SortPairsReverse4 and st.SortPairsDistance1.
// `d` is the vector tag. SortingNetwork calls this after Merge2 when there
// are at least 4 keys per row. Statement order is part of the network and
// must not be changed.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  // Stage 1: reverse v8..vf, then pair vector i with vector 15-i.
  v8 = st.ReverseKeys4(d, v8);
  v9 = st.ReverseKeys4(d, v9);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  vc = st.ReverseKeys4(d, vc);
  vd = st.ReverseKeys4(d, vd);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Stage 2: reverse the upper half of each group of 8 vectors, then pair
  // mirrored vectors within each group.
  v4 = st.ReverseKeys4(d, v4);
  vc = st.ReverseKeys4(d, vc);
  v5 = st.ReverseKeys4(d, v5);
  vd = st.ReverseKeys4(d, vd);
  v6 = st.ReverseKeys4(d, v6);
  ve = st.ReverseKeys4(d, ve);
  v7 = st.ReverseKeys4(d, v7);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Stage 3: same pattern within each group of 4 vectors.
  v2 = st.ReverseKeys4(d, v2);
  v3 = st.ReverseKeys4(d, v3);
  v6 = st.ReverseKeys4(d, v6);
  v7 = st.ReverseKeys4(d, v7);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Stage 4: reverse every odd-numbered vector, then pair adjacent vectors.
  v1 = st.ReverseKeys4(d, v1);
  v3 = st.ReverseKeys4(d, v3);
  v5 = st.ReverseKeys4(d, v5);
  v7 = st.ReverseKeys4(d, v7);
  v9 = st.ReverseKeys4(d, v9);
  vb = st.ReverseKeys4(d, vb);
  vd = st.ReverseKeys4(d, vd);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Within-vector steps: SortPairsReverse4, then SortPairsDistance1, applied
  // to each vector individually.
  v0 = st.SortPairsReverse4(d, v0);
  v1 = st.SortPairsReverse4(d, v1);
  v2 = st.SortPairsReverse4(d, v2);
  v3 = st.SortPairsReverse4(d, v3);
  v4 = st.SortPairsReverse4(d, v4);
  v5 = st.SortPairsReverse4(d, v5);
  v6 = st.SortPairsReverse4(d, v6);
  v7 = st.SortPairsReverse4(d, v7);
  v8 = st.SortPairsReverse4(d, v8);
  v9 = st.SortPairsReverse4(d, v9);
  va = st.SortPairsReverse4(d, va);
  vb = st.SortPairsReverse4(d, vb);
  vc = st.SortPairsReverse4(d, vc);
  vd = st.SortPairsReverse4(d, vd);
  ve = st.SortPairsReverse4(d, ve);
  vf = st.SortPairsReverse4(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}
|
||||
|
||||
// Merge network over the 16 vectors v0..vf (updated in place), using
// st.ReverseKeys8, st.Sort2, st.SortPairsReverse8, st.SortPairsDistance2 and
// st.SortPairsDistance1. `d` is the vector tag. SortingNetwork calls this
// after Merge4 when there are at least 8 keys per row. Statement order is
// part of the network and must not be changed.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  // Stage 1: reverse v8..vf, then pair vector i with vector 15-i.
  v8 = st.ReverseKeys8(d, v8);
  v9 = st.ReverseKeys8(d, v9);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  vc = st.ReverseKeys8(d, vc);
  vd = st.ReverseKeys8(d, vd);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Stage 2: reverse the upper half of each group of 8 vectors, then pair
  // mirrored vectors within each group.
  v4 = st.ReverseKeys8(d, v4);
  vc = st.ReverseKeys8(d, vc);
  v5 = st.ReverseKeys8(d, v5);
  vd = st.ReverseKeys8(d, vd);
  v6 = st.ReverseKeys8(d, v6);
  ve = st.ReverseKeys8(d, ve);
  v7 = st.ReverseKeys8(d, v7);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Stage 3: same pattern within each group of 4 vectors.
  v2 = st.ReverseKeys8(d, v2);
  v3 = st.ReverseKeys8(d, v3);
  v6 = st.ReverseKeys8(d, v6);
  v7 = st.ReverseKeys8(d, v7);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Stage 4: reverse every odd-numbered vector, then pair adjacent vectors.
  v1 = st.ReverseKeys8(d, v1);
  v3 = st.ReverseKeys8(d, v3);
  v5 = st.ReverseKeys8(d, v5);
  v7 = st.ReverseKeys8(d, v7);
  v9 = st.ReverseKeys8(d, v9);
  vb = st.ReverseKeys8(d, vb);
  vd = st.ReverseKeys8(d, vd);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Within-vector steps: SortPairsReverse8, SortPairsDistance2, then
  // SortPairsDistance1, applied to each vector individually.
  v0 = st.SortPairsReverse8(d, v0);
  v1 = st.SortPairsReverse8(d, v1);
  v2 = st.SortPairsReverse8(d, v2);
  v3 = st.SortPairsReverse8(d, v3);
  v4 = st.SortPairsReverse8(d, v4);
  v5 = st.SortPairsReverse8(d, v5);
  v6 = st.SortPairsReverse8(d, v6);
  v7 = st.SortPairsReverse8(d, v7);
  v8 = st.SortPairsReverse8(d, v8);
  v9 = st.SortPairsReverse8(d, v9);
  va = st.SortPairsReverse8(d, va);
  vb = st.SortPairsReverse8(d, vb);
  vc = st.SortPairsReverse8(d, vc);
  vd = st.SortPairsReverse8(d, vd);
  ve = st.SortPairsReverse8(d, ve);
  vf = st.SortPairsReverse8(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}
|
||||
|
||||
// Unused on MSVC, see below
|
||||
#if !HWY_COMPILER_MSVC
|
||||
|
||||
// Merge network over the 16 vectors v0..vf (updated in place), using
// st.ReverseKeys16, st.Sort2, st.SortPairsReverse16 and
// st.SortPairsDistance4/2/1. `d` is the vector tag. SortingNetwork calls this
// after Merge8 when there are at least 16 keys per row (non-MSVC, non-debug
// builds only). Statement order is part of the network and must not be
// changed.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
                        V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
                        V& vd, V& ve, V& vf) {
  // Stage 1: reverse v8..vf, then pair vector i with vector 15-i.
  v8 = st.ReverseKeys16(d, v8);
  v9 = st.ReverseKeys16(d, v9);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  vc = st.ReverseKeys16(d, vc);
  vd = st.ReverseKeys16(d, vd);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  // Stage 2: reverse the upper half of each group of 8 vectors, then pair
  // mirrored vectors within each group.
  v4 = st.ReverseKeys16(d, v4);
  vc = st.ReverseKeys16(d, vc);
  v5 = st.ReverseKeys16(d, v5);
  vd = st.ReverseKeys16(d, vd);
  v6 = st.ReverseKeys16(d, v6);
  ve = st.ReverseKeys16(d, ve);
  v7 = st.ReverseKeys16(d, v7);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  // Stage 3: same pattern within each group of 4 vectors.
  v2 = st.ReverseKeys16(d, v2);
  v3 = st.ReverseKeys16(d, v3);
  v6 = st.ReverseKeys16(d, v6);
  v7 = st.ReverseKeys16(d, v7);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  // Stage 4: reverse every odd-numbered vector, then pair adjacent vectors.
  v1 = st.ReverseKeys16(d, v1);
  v3 = st.ReverseKeys16(d, v3);
  v5 = st.ReverseKeys16(d, v5);
  v7 = st.ReverseKeys16(d, v7);
  v9 = st.ReverseKeys16(d, v9);
  vb = st.ReverseKeys16(d, vb);
  vd = st.ReverseKeys16(d, vd);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  // Within-vector steps: SortPairsReverse16, then SortPairsDistance4, 2 and
  // 1, applied to each vector individually.
  v0 = st.SortPairsReverse16(d, v0);
  v1 = st.SortPairsReverse16(d, v1);
  v2 = st.SortPairsReverse16(d, v2);
  v3 = st.SortPairsReverse16(d, v3);
  v4 = st.SortPairsReverse16(d, v4);
  v5 = st.SortPairsReverse16(d, v5);
  v6 = st.SortPairsReverse16(d, v6);
  v7 = st.SortPairsReverse16(d, v7);
  v8 = st.SortPairsReverse16(d, v8);
  v9 = st.SortPairsReverse16(d, v9);
  va = st.SortPairsReverse16(d, va);
  vb = st.SortPairsReverse16(d, vb);
  vc = st.SortPairsReverse16(d, vc);
  vd = st.SortPairsReverse16(d, vd);
  ve = st.SortPairsReverse16(d, ve);
  vf = st.SortPairsReverse16(d, vf);
  v0 = st.SortPairsDistance4(d, v0);
  v1 = st.SortPairsDistance4(d, v1);
  v2 = st.SortPairsDistance4(d, v2);
  v3 = st.SortPairsDistance4(d, v3);
  v4 = st.SortPairsDistance4(d, v4);
  v5 = st.SortPairsDistance4(d, v5);
  v6 = st.SortPairsDistance4(d, v6);
  v7 = st.SortPairsDistance4(d, v7);
  v8 = st.SortPairsDistance4(d, v8);
  v9 = st.SortPairsDistance4(d, v9);
  va = st.SortPairsDistance4(d, va);
  vb = st.SortPairsDistance4(d, vb);
  vc = st.SortPairsDistance4(d, vc);
  vd = st.SortPairsDistance4(d, vd);
  ve = st.SortPairsDistance4(d, ve);
  vf = st.SortPairsDistance4(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}
|
||||
|
||||
#endif // !HWY_COMPILER_MSVC
|
||||
|
||||
// Reshapes `buf` into a matrix, sorts columns independently, and then merges
|
||||
// into a sorted 1D array without transposing.
|
||||
//
|
||||
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
|
||||
// differences in sort order and single-lane vs 128-bit keys.
|
||||
// `buf` ensures full vectors are aligned, and enables loads/stores without
|
||||
// bounds checks.
|
||||
//
|
||||
// NOINLINE because this is large and called twice from vqsort-inl.h.
|
||||
//
|
||||
// References:
|
||||
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
|
||||
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
|
||||
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
|
||||
// Parameters:
//   st:   traits object; supplies LanesPerKey() and is forwarded to Sort16
//         and the Merge* networks.
//   buf:  16 rows of `cols` lanes each; row r starts at buf + r * cols. Each
//         row is loaded and stored as one vector of tag `d`.
//         NOTE(review): presumably the caller pads `buf` so that a whole
//         vector can be accessed at the last row even when cols < Lanes(d) —
//         confirm against the caller in vqsort-inl.h.
//   cols: lanes per row; must not exceed Constants::kMaxCols.
template <class Traits, typename T>
HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
  const CappedTag<T, Constants::kMaxCols> d;
  using V = decltype(Zero(d));

  HWY_DASSERT(cols <= Constants::kMaxCols);

  // The network width depends on the number of keys, not lanes.
  constexpr size_t kLanesPerKey = st.LanesPerKey();
  const size_t keys = cols / kLanesPerKey;
  constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;

  // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
  // offsets to duplicating this code for every value of cols.
  static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
  V v0 = LoadU(d, buf + 0x0 * cols);
  V v1 = LoadU(d, buf + 0x1 * cols);
  V v2 = LoadU(d, buf + 0x2 * cols);
  V v3 = LoadU(d, buf + 0x3 * cols);
  V v4 = LoadU(d, buf + 0x4 * cols);
  V v5 = LoadU(d, buf + 0x5 * cols);
  V v6 = LoadU(d, buf + 0x6 * cols);
  V v7 = LoadU(d, buf + 0x7 * cols);
  V v8 = LoadU(d, buf + 0x8 * cols);
  V v9 = LoadU(d, buf + 0x9 * cols);
  V va = LoadU(d, buf + 0xa * cols);
  V vb = LoadU(d, buf + 0xb * cols);
  V vc = LoadU(d, buf + 0xc * cols);
  V vd = LoadU(d, buf + 0xd * cols);
  V ve = LoadU(d, buf + 0xe * cols);
  V vf = LoadU(d, buf + 0xf * cols);

  // Column sort across the 16 row vectors.
  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);

  // Merge stages are nested: each wider merge runs only after the narrower
  // ones, and only if both the runtime key count and the compile-time maximum
  // allow it.
  // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
  // code paths: if MaxLanes < 2, then keys <= cols < 2.
  if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
    Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
           vf);

    if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
      Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
             vf);

      if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
        Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
               ve, vf);

        // Avoids build timeout. Must match #if condition in kMaxCols.
#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
        if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
          Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
                  ve, vf);

          static_assert(Constants::kMaxCols <= 16, "Add more branches");
        }
#endif
      }
    }
  }

  // Write the rows back to the same locations they were loaded from.
  StoreU(v0, d, buf + 0x0 * cols);
  StoreU(v1, d, buf + 0x1 * cols);
  StoreU(v2, d, buf + 0x2 * cols);
  StoreU(v3, d, buf + 0x3 * cols);
  StoreU(v4, d, buf + 0x4 * cols);
  StoreU(v5, d, buf + 0x5 * cols);
  StoreU(v6, d, buf + 0x6 * cols);
  StoreU(v7, d, buf + 0x7 * cols);
  StoreU(v8, d, buf + 0x8 * cols);
  StoreU(v9, d, buf + 0x9 * cols);
  StoreU(va, d, buf + 0xa * cols);
  StoreU(vb, d, buf + 0xb * cols);
  StoreU(vc, d, buf + 0xc * cols);
  StoreU(vd, d, buf + 0xd * cols);
  StoreU(ve, d, buf + 0xe * cols);
  StoreU(vf, d, buf + 0xf * cols);
}
|
||||
|
||||
#else
|
||||
// Fallback used in the #else branch of VQSORT_ENABLED: adds nothing to
// `Base`, but keeps the SharedTraits name available.
template <class Base>
struct SharedTraits : public Base {};
|
||||
#endif // VQSORT_ENABLED
|
||||
|
||||
} // namespace detail
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
|
||||
@@ -0,0 +1,527 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
|
||||
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
|
||||
#include "hwy/contrib/sort/vqsort.h" // SortDescending
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/print.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace detail {
|
||||
|
||||
#if VQSORT_ENABLED || HWY_IDE
|
||||
|
||||
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
|
||||
// along with an abstraction layer for single-lane vs. lane-pair, which is
|
||||
// independent of the order.
|
||||
template <typename T>
|
||||
struct KeyLane {
|
||||
static constexpr bool Is128() { return false; }
|
||||
constexpr size_t LanesPerKey() const { return 1; }
|
||||
|
||||
// What type bench_sort should allocate for generating inputs.
|
||||
using LaneType = T;
|
||||
// What type to pass to Sorter::operator().
|
||||
using KeyType = T;
|
||||
|
||||
std::string KeyString() const {
|
||||
char string100[100];
|
||||
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
|
||||
return string100;
|
||||
}
|
||||
|
||||
// For HeapSort
|
||||
HWY_INLINE void Swap(T* a, T* b) const {
|
||||
const T temp = *a;
|
||||
*a = *b;
|
||||
*b = temp;
|
||||
}
|
||||
|
||||
template <class V, class M>
|
||||
HWY_INLINE V CompressKeys(V keys, M mask) const {
|
||||
return CompressNot(keys, mask);
|
||||
}
|
||||
|
||||
// Broadcasts one key into a vector
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
|
||||
return Set(d, *key);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
|
||||
return Eq(a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> NotEqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
|
||||
return Ne(a, b);
|
||||
}
|
||||
|
||||
HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
|
||||
return Reverse(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
|
||||
return Reverse2(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
|
||||
return Reverse4(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
|
||||
return Reverse8(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
|
||||
static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
|
||||
return ReverseKeys(d, v);
|
||||
}
|
||||
|
||||
template <class V>
|
||||
HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
|
||||
return OddEven(odd, even);
|
||||
}
|
||||
|
||||
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
|
||||
HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
|
||||
const Repartition<uint32_t, D> du32;
|
||||
return BitCast(d, Shuffle2301(BitCast(du32, v)));
|
||||
}
|
||||
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
|
||||
HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
|
||||
return Shuffle1032(v);
|
||||
}
|
||||
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
|
||||
return SwapAdjacentBlocks(v);
|
||||
}
|
||||
|
||||
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
|
||||
#if HWY_HAVE_FLOAT64 // in case D is float32
|
||||
const RepartitionToWide<D> dw;
|
||||
#else
|
||||
const RepartitionToWide<RebindToUnsigned<D> > dw;
|
||||
#endif
|
||||
return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
|
||||
}
|
||||
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
|
||||
// Assumes max vector size = 512
|
||||
return ConcatLowerUpper(d, v, v);
|
||||
}
|
||||
|
||||
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
|
||||
const Vec<D> even) const {
|
||||
#if HWY_HAVE_FLOAT64 // in case D is float32
|
||||
const RepartitionToWide<D> dw;
|
||||
#else
|
||||
const RepartitionToWide<RebindToUnsigned<D> > dw;
|
||||
#endif
|
||||
return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
|
||||
}
|
||||
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
|
||||
return OddEvenBlocks(odd, even);
|
||||
}
|
||||
|
||||
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
|
||||
#if HWY_HAVE_FLOAT64 // in case D is float32
|
||||
const RepartitionToWide<D> dw;
|
||||
#else
|
||||
const RepartitionToWide<RebindToUnsigned<D> > dw;
|
||||
#endif
|
||||
return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
|
||||
}
|
||||
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
|
||||
return ConcatUpperLower(d, odd, even);
|
||||
}
|
||||
};
|
||||
|
||||
// Anything order-related depends on the key traits *and* the order (see
|
||||
// FirstOfLanes). We cannot implement just one Compare function because Lt128
|
||||
// only compiles if the lane type is u64. Thus we need either overloaded
|
||||
// functions with a tag type, class specializations, or separate classes.
|
||||
// We avoid overloaded functions because we want all functions to be callable
|
||||
// from a SortTraits without per-function wrappers. Specializing would work, but
|
||||
// we are anyway going to specialize at a higher level.
|
||||
template <typename T>
|
||||
struct OrderAscending : public KeyLane<T> {
|
||||
using Order = SortAscending;
|
||||
|
||||
HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
|
||||
return Lt(a, b);
|
||||
}
|
||||
|
||||
// Two halves of Sort2, used in ScanMinMax.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
|
||||
return Min(a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
|
||||
return Max(a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
|
||||
T* HWY_RESTRICT /* buf */) const {
|
||||
return MinOfLanes(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
|
||||
T* HWY_RESTRICT /* buf */) const {
|
||||
return MaxOfLanes(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstValue(D d) const {
|
||||
return Set(d, hwy::LowestValue<T>());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastValue(D d) const {
|
||||
return Set(d, hwy::HighestValue<T>());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
|
||||
return Sub(v, Set(d, hwy::Epsilon<T>()));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct OrderDescending : public KeyLane<T> {
|
||||
using Order = SortDescending;
|
||||
|
||||
HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
|
||||
return Lt(b, a);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
|
||||
return Max(a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
|
||||
return Min(a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
|
||||
T* HWY_RESTRICT /* buf */) const {
|
||||
return MaxOfLanes(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
|
||||
T* HWY_RESTRICT /* buf */) const {
|
||||
return MinOfLanes(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstValue(D d) const {
|
||||
return Set(d, hwy::HighestValue<T>());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastValue(D d) const {
|
||||
return Set(d, hwy::LowestValue<T>());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
|
||||
return Add(v, Set(d, hwy::Epsilon<T>()));
|
||||
}
|
||||
};
|
||||
|
||||
// Ascending-order traits for 64-bit lanes that are ordered only by their upper
// 32 bits (see Compare1/Compare).
struct OrderAscendingKV64 : public KeyLane<uint64_t> {
  using Order = SortAscending;

  // Scalar comparison of the upper 32 bits only. Const, like the vector
  // Compare below, so it can be called through a const traits object.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return (*a >> 32) < (*b >> 32);
  }

  // Lane-wise comparison of the upper 32 bits only.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(ShiftRight<32>(a), ShiftRight<32>(b));
  }

  // Not required to be stable (preserving the order of equivalent keys), so
  // we can include the value in the comparison.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  // Reduces v with MinOfLanes; buf is unused.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 uint64_t* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  // Reduces v with MaxOfLanes; buf is unused.
  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                uint64_t* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  // Same as for regular lanes.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  // Steps the key (upper 32 bits) down by one. Compare ignores the lower 32
  // bits, so subtracting 1 from the whole lane would leave the key unchanged
  // whenever those bits are nonzero; subtract one key unit instead.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    return Sub(v, Set(d, uint64_t{1} << 32));
  }
};
|
||||
|
||||
// Descending-order traits for 64-bit lanes that are ordered only by their
// upper 32 bits (see Compare1/Compare).
struct OrderDescendingKV64 : public KeyLane<uint64_t> {
  using Order = SortDescending;

  // Scalar comparison of the upper 32 bits only, operands exchanged for
  // descending order. Const so it can be called through const traits.
  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const {
    return (*b >> 32) < (*a >> 32);
  }

  // Lane-wise comparison of the upper 32 bits only, operands exchanged.
  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(ShiftRight<32>(b), ShiftRight<32>(a));
  }

  // Not required to be stable (preserving the order of equivalent keys), so
  // we can include the value in the comparison.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  // Reduces v with MaxOfLanes; buf is unused.
  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 uint64_t* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  // Reduces v with MinOfLanes; buf is unused.
  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                uint64_t* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  // Vector filled with the lane value that sorts before all others.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  // Vector filled with the lane value that sorts after all others.
  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  // Steps the key (upper 32 bits) up by one, i.e. towards FirstValue. Compare
  // ignores the lower 32 bits, so adding 1 to the whole lane would leave the
  // key unchanged unless those bits overflow; add one key unit instead.
  template <class D>
  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
    return Add(v, Set(d, uint64_t{1} << 32));
  }
};
|
||||
|
||||
// Shared code that depends on Order.
|
||||
template <class Base>
|
||||
struct TraitsLane : public Base {
|
||||
// For each lane i: replaces a[i] with the first and b[i] with the second
|
||||
// according to Base.
|
||||
// Corresponds to a conditional swap, which is one "node" of a sorting
|
||||
// network. Min/Max are cheaper than compare + blend at least for integers.
|
||||
template <class D>
|
||||
HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
|
||||
const Vec<D> a_copy = a;
|
||||
// Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
|
||||
// instructions. We can reduce it to a compare + 2 IfThenElse.
|
||||
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
|
||||
if (sizeof(TFromD<D>) == 8) {
|
||||
const Mask<D> cmp = base->Compare(d, a, b);
|
||||
a = IfThenElse(cmp, a, b);
|
||||
b = IfThenElse(cmp, b, a_copy);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
a = base->First(d, a, b);
|
||||
b = base->Last(d, a_copy, b);
|
||||
}
|
||||
|
||||
// Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
|
||||
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->ReverseKeys2(d, v);
|
||||
// Further to the above optimization, Sort2+OddEvenKeys compile to four
|
||||
// instructions; we can save one by combining two blends.
|
||||
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
|
||||
const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped));
|
||||
return IfVecThenElse(DupOdd(cmp), swapped, v);
|
||||
#else
|
||||
Sort2(d, v, swapped);
|
||||
return base->OddEvenKeys(swapped, v);
|
||||
#endif
|
||||
}
|
||||
|
||||
// (See above - we use Sort2 for non-64-bit types.)
|
||||
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
|
||||
HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->ReverseKeys2(d, v);
|
||||
Sort2(d, v, swapped);
|
||||
return base->OddEvenKeys(swapped, v);
|
||||
}
|
||||
|
||||
// Swaps with the vector formed by reversing contiguous groups of 4 keys.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->ReverseKeys4(d, v);
|
||||
Sort2(d, v, swapped);
|
||||
return base->OddEvenPairs(d, swapped, v);
|
||||
}
|
||||
|
||||
// Conditionally swaps lane 0 with 4, 1 with 5 etc.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->SwapAdjacentQuads(d, v);
|
||||
// Only used in Merge16, so this will not be used on AVX2 (which only has 4
|
||||
// u64 lanes), so skip the above optimization for 64-bit AVX2.
|
||||
Sort2(d, v, swapped);
|
||||
return base->OddEvenQuads(d, swapped, v);
|
||||
}
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
// Base class shared between OrderAscending, OrderDescending.
|
||||
template <typename T>
|
||||
struct KeyLane {
|
||||
constexpr bool Is128() const { return false; }
|
||||
constexpr size_t LanesPerKey() const { return 1; }
|
||||
|
||||
using LaneType = T;
|
||||
using KeyType = T;
|
||||
|
||||
std::string KeyString() const {
|
||||
char string100[100];
|
||||
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
|
||||
return string100;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct OrderAscending : public KeyLane<T> {
|
||||
using Order = SortAscending;
|
||||
|
||||
HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
|
||||
return Lt(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct OrderDescending : public KeyLane<T> {
|
||||
using Order = SortDescending;
|
||||
|
||||
HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
|
||||
return Lt(b, a);
|
||||
}
|
||||
};
|
||||
|
||||
template <class Order>
|
||||
struct TraitsLane : public Order {
|
||||
// For HeapSort
|
||||
template <typename T> // MSVC doesn't find typename Order::LaneType.
|
||||
HWY_INLINE void Swap(T* a, T* b) const {
|
||||
const T temp = *a;
|
||||
*a = *b;
|
||||
*b = temp;
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
|
||||
return Set(d, *key);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // VQSORT_ENABLED
|
||||
|
||||
} // namespace detail
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
|
||||
@@ -0,0 +1,492 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target
|
||||
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
|
||||
defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
|
||||
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
|
||||
#else
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
|
||||
#endif
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "hwy/contrib/sort/shared-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort.h" // SortDescending
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace detail {
|
||||
|
||||
#if VQSORT_ENABLED || HWY_IDE
|
||||
|
||||
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
|
||||
// along with an abstraction layer for single-lane vs. lane-pair, which is
|
||||
// independent of the order.
|
||||
struct KeyAny128 {
|
||||
static constexpr bool Is128() { return true; }
|
||||
constexpr size_t LanesPerKey() const { return 2; }
|
||||
|
||||
// What type bench_sort should allocate for generating inputs.
|
||||
using LaneType = uint64_t;
|
||||
// KeyType and KeyString are defined by derived classes.
|
||||
|
||||
HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
|
||||
const FixedTag<LaneType, 2> d;
|
||||
const auto temp = LoadU(d, a);
|
||||
StoreU(LoadU(d, b), d, a);
|
||||
StoreU(temp, d, b);
|
||||
}
|
||||
|
||||
template <class V, class M>
|
||||
HWY_INLINE V CompressKeys(V keys, M mask) const {
|
||||
return CompressBlocksNot(keys, mask);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
|
||||
return LoadDup128(d, key);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
|
||||
return ReverseBlocks(d, v);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
|
||||
return SwapAdjacentBlocks(v);
|
||||
}
|
||||
|
||||
// Only called for 4 keys because we do not support >512-bit vectors.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
|
||||
HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
|
||||
return ReverseKeys(d, v);
|
||||
}
|
||||
|
||||
// Only called for 4 keys because we do not support >512-bit vectors.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
|
||||
const Vec<D> even) const {
|
||||
HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
|
||||
return ConcatUpperLower(d, odd, even);
|
||||
}
|
||||
|
||||
template <class V>
|
||||
HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
|
||||
return OddEvenBlocks(odd, even);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
|
||||
HWY_ASSERT(0); // not supported: would require 1024-bit vectors
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
|
||||
HWY_ASSERT(0); // not supported: would require 2048-bit vectors
|
||||
}
|
||||
|
||||
// This is only called for 8/16 col networks (not supported).
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
|
||||
HWY_ASSERT(0);
|
||||
}
|
||||
|
||||
// This is only called for 16 col networks (not supported).
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
|
||||
HWY_ASSERT(0);
|
||||
}
|
||||
|
||||
// This is only called for 8 col networks (not supported).
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
|
||||
HWY_ASSERT(0);
|
||||
}
|
||||
};
|
||||
|
||||
// Base class shared between OrderAscending128, OrderDescending128.
|
||||
struct Key128 : public KeyAny128 {
|
||||
// What type to pass to Sorter::operator().
|
||||
using KeyType = hwy::uint128_t;
|
||||
|
||||
std::string KeyString() const { return "U128"; }
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Eq128(d, a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Ne128(d, a, b);
|
||||
}
|
||||
|
||||
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
|
||||
return a[0] == b[0] && a[1] == b[1];
|
||||
}
|
||||
};
|
||||
|
||||
// Anything order-related depends on the key traits *and* the order (see
|
||||
// FirstOfLanes). We cannot implement just one Compare function because Lt128
|
||||
// only compiles if the lane type is u64. Thus we need either overloaded
|
||||
// functions with a tag type, class specializations, or separate classes.
|
||||
// We avoid overloaded functions because we want all functions to be callable
|
||||
// from a SortTraits without per-function wrappers. Specializing would work, but
|
||||
// we are anyway going to specialize at a higher level.
|
||||
struct OrderAscending128 : public Key128 {
|
||||
using Order = SortAscending;
|
||||
|
||||
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
|
||||
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Lt128(d, a, b);
|
||||
}
|
||||
|
||||
// Used by CompareTop
|
||||
template <class V>
|
||||
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
|
||||
return Lt(a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Min128(d, a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Max128(d, a, b);
|
||||
}
|
||||
|
||||
// Same as for regular lanes because 128-bit lanes are u64.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstValue(D d) const {
|
||||
return Set(d, hwy::LowestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastValue(D d) const {
|
||||
return Set(d, hwy::HighestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
|
||||
const Vec<D> k0 = Zero(d);
|
||||
const Vec<D> k1 = OddEven(k0, Set(d, 1));
|
||||
const Mask<D> borrow = Eq(v, k0); // don't-care, lo == 0
|
||||
// lo == 0? 1 : 0, 0
|
||||
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(borrow, k1));
|
||||
return Sub(Sub(v, k1), adjust);
|
||||
}
|
||||
};
|
||||
|
||||
struct OrderDescending128 : public Key128 {
|
||||
using Order = SortDescending;
|
||||
|
||||
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
|
||||
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Lt128(d, b, a);
|
||||
}
|
||||
|
||||
// Used by CompareTop
|
||||
template <class V>
|
||||
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
|
||||
return Lt(b, a);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Max128(d, a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Min128(d, a, b);
|
||||
}
|
||||
|
||||
// Same as for regular lanes because 128-bit lanes are u64.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstValue(D d) const {
|
||||
return Set(d, hwy::HighestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastValue(D d) const {
|
||||
return Set(d, hwy::LowestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
|
||||
const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
|
||||
const Vec<D> added = Add(v, k1);
|
||||
const Mask<D> overflowed = Lt(added, v); // false, overflowed
|
||||
// overflowed? 1 : 0, 0
|
||||
const Vec<D> adjust = ShiftLeftLanes<1>(IfThenElseZero(overflowed, k1));
|
||||
return Add(added, adjust);
|
||||
}
|
||||
};
|
||||
|
||||
// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
|
||||
struct KeyValue128 : public KeyAny128 {
|
||||
// What type to pass to Sorter::operator().
|
||||
using KeyType = K64V64;
|
||||
|
||||
std::string KeyString() const { return "KV128"; }
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Eq128Upper(d, a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> NotEqualKeys(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Ne128Upper(d, a, b);
|
||||
}
|
||||
|
||||
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
|
||||
return a[1] == b[1];
|
||||
}
|
||||
};
|
||||
|
||||
struct OrderAscendingKV128 : public KeyValue128 {
|
||||
using Order = SortAscending;
|
||||
|
||||
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
|
||||
return a[1] < b[1];
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Lt128Upper(d, a, b);
|
||||
}
|
||||
|
||||
// Used by CompareTop
|
||||
template <class V>
|
||||
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
|
||||
return Lt(a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Min128Upper(d, a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Max128Upper(d, a, b);
|
||||
}
|
||||
|
||||
// Same as for regular lanes because 128-bit lanes are u64.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstValue(D d) const {
|
||||
return Set(d, hwy::LowestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastValue(D d) const {
|
||||
return Set(d, hwy::HighestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
|
||||
const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
|
||||
return Sub(v, k1);
|
||||
}
|
||||
};
|
||||
|
||||
struct OrderDescendingKV128 : public KeyValue128 {
|
||||
using Order = SortDescending;
|
||||
|
||||
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
|
||||
return b[1] < a[1];
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
|
||||
return Lt128Upper(d, b, a);
|
||||
}
|
||||
|
||||
// Used by CompareTop
|
||||
template <class V>
|
||||
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
|
||||
return Lt(b, a);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Max128Upper(d, a, b);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
|
||||
return Min128Upper(d, a, b);
|
||||
}
|
||||
|
||||
// Same as for regular lanes because 128-bit lanes are u64.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstValue(D d) const {
|
||||
return Set(d, hwy::HighestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastValue(D d) const {
|
||||
return Set(d, hwy::LowestValue<TFromD<D> >());
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
|
||||
const Vec<D> k1 = OddEven(Set(d, 1), Zero(d));
|
||||
return Add(v, k1);
|
||||
}
|
||||
};
|
||||
|
||||
// Shared code that depends on Order.
|
||||
template <class Base>
|
||||
class Traits128 : public Base {
|
||||
// Special case for >= 256 bit vectors
|
||||
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
|
||||
// Returns vector with only the top u64 lane valid. Useful when the next step
|
||||
// is to replicate the mask anyway.
|
||||
template <class D>
|
||||
HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
const Mask<D> eqHL = Eq(a, b);
|
||||
const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
|
||||
#if HWY_TARGET == HWY_SVE_256
|
||||
return IfThenElse(eqHL, DupEven(ltHL), ltHL);
|
||||
#else
|
||||
const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
|
||||
return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);
|
||||
#endif
|
||||
}
|
||||
|
||||
// We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
|
||||
// the most-significant of those lanes (the result of CompareTop), so
|
||||
// replicate it 4x. Only called for >= 256-bit vectors.
|
||||
template <class V>
|
||||
HWY_INLINE V ReplicateTop4x(V v) const {
|
||||
#if HWY_TARGET == HWY_SVE_256
|
||||
return svdup_lane_u64(v, 3);
|
||||
#elif HWY_TARGET <= HWY_AVX3
|
||||
return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
|
||||
#else // AVX2
|
||||
return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
|
||||
#endif
|
||||
}
|
||||
#endif // HWY_TARGET
|
||||
|
||||
public:
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
|
||||
TFromD<D>* HWY_RESTRICT buf) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
const size_t N = Lanes(d);
|
||||
Store(v, d, buf);
|
||||
v = base->SetKey(d, buf + 0); // result must be broadcasted
|
||||
for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
|
||||
v = base->First(d, v, base->SetKey(d, buf + i));
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
|
||||
TFromD<D>* HWY_RESTRICT buf) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
const size_t N = Lanes(d);
|
||||
Store(v, d, buf);
|
||||
v = base->SetKey(d, buf + 0); // result must be broadcasted
|
||||
for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
|
||||
v = base->Last(d, v, base->SetKey(d, buf + i));
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
|
||||
const Vec<D> a_copy = a;
|
||||
const auto lt = base->Compare(d, a, b);
|
||||
a = IfThenElse(lt, a, b);
|
||||
b = IfThenElse(lt, b, a_copy);
|
||||
}
|
||||
|
||||
// Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->ReverseKeys2(d, v);
|
||||
|
||||
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
|
||||
const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
|
||||
return IfVecThenElse(select, swapped, v);
|
||||
#else
|
||||
Sort2(d, v, swapped);
|
||||
return base->OddEvenKeys(swapped, v);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Swaps with the vector formed by reversing contiguous groups of 4 keys.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
|
||||
const Base* base = static_cast<const Base*>(this);
|
||||
Vec<D> swapped = base->ReverseKeys4(d, v);
|
||||
|
||||
// Only specialize for AVX3 because this requires 512-bit vectors.
|
||||
#if HWY_TARGET <= HWY_AVX3
|
||||
const Vec512<uint64_t> outHx = CompareTop(d, v, swapped);
|
||||
// Similar to ReplicateTop4x, we want to gang together 2 comparison results
|
||||
// (4 lanes). They are not contiguous, so use permute to replicate 4x.
|
||||
alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
|
||||
const Vec512<uint64_t> select =
|
||||
TableLookupLanes(outHx, SetTableIndices(d, kIndices));
|
||||
return IfVecThenElse(select, swapped, v);
|
||||
#else
|
||||
Sort2(d, v, swapped);
|
||||
return base->OddEvenPairs(d, swapped, v);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Conditionally swaps lane 0 with 4, 1 with 5 etc.
|
||||
template <class D>
|
||||
HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
|
||||
// Only used by Merge16, which would require 2048 bit vectors (unsupported).
|
||||
HWY_ASSERT(0);
|
||||
}
|
||||
};
|
||||
|
||||
#endif // VQSORT_ENABLED
|
||||
|
||||
} // namespace detail
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,184 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#include <string.h> // memset
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/shared-inl.h"
|
||||
|
||||
// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an
|
||||
// optimization that replaces dynamic allocation with stack storage.
|
||||
#ifndef VQSORT_STACK
|
||||
#if HWY_ARCH_X86 || HWY_ARCH_WASM
|
||||
#define VQSORT_STACK 1
|
||||
#else
|
||||
#define VQSORT_STACK 0
|
||||
#endif
|
||||
#endif // VQSORT_STACK
|
||||
|
||||
#if !VQSORT_STACK
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#endif
|
||||
|
||||
// Check if we have sys/random.h. First skip some systems on which the check
|
||||
// itself (features.h) might be problematic.
|
||||
#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
|
||||
#define VQSORT_GETRANDOM 0
|
||||
#endif
|
||||
|
||||
#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
|
||||
#include <features.h>
|
||||
|
||||
// ---- which libc
|
||||
#if defined(__UCLIBC__)
|
||||
#define VQSORT_GETRANDOM 1 // added Mar 2015, before uclibc-ng 1.0
|
||||
|
||||
#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ)
|
||||
#if __GLIBC_PREREQ(2, 25)
|
||||
#define VQSORT_GETRANDOM 1
|
||||
#else
|
||||
#define VQSORT_GETRANDOM 0
|
||||
#endif
|
||||
|
||||
#else
|
||||
// Assume MUSL, which has getrandom since 2018. There is no macro to test, see
|
||||
// https://www.openwall.com/lists/musl/2013/03/29/13.
|
||||
#define VQSORT_GETRANDOM 1
|
||||
|
||||
#endif // ---- which libc
|
||||
#endif // linux
|
||||
|
||||
#if !defined(VQSORT_GETRANDOM)
|
||||
#define VQSORT_GETRANDOM 0
|
||||
#endif
|
||||
|
||||
// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
|
||||
// (not all Android support the getrandom wrapper)
|
||||
#ifndef VQSORT_SECURE_SEED
|
||||
|
||||
#if VQSORT_GETRANDOM
|
||||
#define VQSORT_SECURE_SEED 1
|
||||
#elif defined(_WIN32) || defined(_WIN64)
|
||||
#define VQSORT_SECURE_SEED 2
|
||||
#else
|
||||
#define VQSORT_SECURE_SEED 0
|
||||
#endif
|
||||
|
||||
#endif // VQSORT_SECURE_SEED
|
||||
|
||||
#if !VQSORT_SECURE_RNG
|
||||
|
||||
#include <time.h>
|
||||
#if VQSORT_SECURE_SEED == 1
|
||||
#include <sys/random.h>
|
||||
#elif VQSORT_SECURE_SEED == 2
|
||||
#include <windows.h>
|
||||
#pragma comment(lib, "advapi32.lib")
|
||||
// Must come after windows.h.
|
||||
#include <wincrypt.h>
|
||||
#endif // VQSORT_SECURE_SEED
|
||||
|
||||
#endif // !VQSORT_SECURE_RNG
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); }
|
||||
// Reports the value of HWY_HAVE_FLOAT64 for the target being compiled.
bool HaveFloat64() {
  return HWY_HAVE_FLOAT64;
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(VectorSize);
|
||||
HWY_EXPORT(HaveFloat64);
|
||||
|
||||
} // namespace
|
||||
|
||||
// Constructs a Sorter. With VQSORT_STACK no heap buffer is needed; otherwise
// a zero-initialized scratch buffer large enough for any key type is
// allocated once so it can be reused across sorts.
Sorter::Sorter() {
#if VQSORT_STACK
  ptr_ = nullptr;  // Sort will use stack storage instead
#else
  // Determine the largest buffer size required for any type by trying them all.
  // (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t
  // may require a larger buffer.)
  const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
  const size_t max_bytes =
      HWY_MAX(HWY_MAX(SortConstants::BufBytes<uint16_t>(vector_size),
                      SortConstants::BufBytes<uint32_t>(vector_size)),
              SortConstants::BufBytes<uint64_t>(vector_size));
  ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);

  // The allocator may return null. Fail with a diagnostic here instead of
  // handing a null pointer to memset (undefined behavior) or to a later sort.
  HWY_ASSERT(ptr_ != nullptr);

  // Prevent msan errors by initializing.
  memset(ptr_, 0, max_bytes);
#endif
}
|
||||
|
||||
// Releases the scratch buffer (if one is in use) and resets the pointer.
void Sorter::Delete() {
#if VQSORT_STACK
  // Stack storage is used; there is nothing to free.
#else
  FreeAlignedBytes(ptr_, nullptr, nullptr);
  ptr_ = nullptr;
#endif
}
|
||||
|
||||
#if !VQSORT_SECURE_RNG
|
||||
|
||||
// Writes 24 bytes of seed material to `bytes`. Uses the OS random source
// selected by VQSORT_SECURE_SEED when available; if that is disabled or
// fails, mixes stack/heap/code addresses, clock() and `seed_num` instead.
void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) {
#if VQSORT_SECURE_SEED == 1
  // May block if urandom is not yet initialized.
  if (getrandom(bytes, 24, /*flags=*/0) == 24) return;
#elif VQSORT_SECURE_SEED == 2
  HCRYPTPROV provider{};
  if (CryptAcquireContextA(&provider, nullptr, nullptr, PROV_RSA_FULL,
                           CRYPT_VERIFYCONTEXT)) {
    const BOOL filled =
        CryptGenRandom(provider, 24, reinterpret_cast<BYTE*>(bytes));
    CryptReleaseContext(provider, 0);
    if (filled) return;
  }
#endif

  // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from
  // stack/heap/code addresses and the clock() timer.
  uint64_t* out = reinterpret_cast<uint64_t*>(bytes);
  uint64_t** stack_addr = &out;
  void (*code_addr)(const void*, size_t, void*) = &Fill24Bytes;

  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(stack_addr);
  const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap);
  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(code_addr);

  // The time and caller-supplied number are mixed into all three words.
  const uint64_t common = static_cast<uint64_t>(clock()) ^ seed_num;
  out[0] = bits_stack ^ common;
  out[1] = bits_heap ^ common;
  out[2] = bits_code ^ common;
}
|
||||
|
||||
#endif // !VQSORT_SECURE_RNG
|
||||
|
||||
// Forwards to the per-target HaveFloat64 chosen by dynamic dispatch.
bool Sorter::HaveFloat64() {
  return HWY_DYNAMIC_DISPATCH(HaveFloat64)();
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,108 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Interface to vectorized quicksort with dynamic dispatch.
|
||||
// Blog post: https://tinyurl.com/vqsort-blog
|
||||
// Paper with measurements: https://arxiv.org/abs/2205.05982
|
||||
//
|
||||
// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
|
||||
// worthwhile, we recommend using this code for sorting arrays whose size is at
|
||||
// least 512 KiB.
|
||||
|
||||
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
|
||||
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Tag arguments that determine the sort order.
|
||||
// Tag argument requesting ascending order; IsAscending() is always true.
struct SortAscending {
  constexpr bool IsAscending() const {
    return true;
  }
};
|
||||
// Tag argument requesting descending order; IsAscending() is always false.
struct SortDescending {
  constexpr bool IsAscending() const {
    return false;
  }
};
|
||||
|
||||
// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
|
||||
// This allows amortizing the allocation over multiple sorts.
|
||||
class HWY_CONTRIB_DLLEXPORT Sorter {
|
||||
public:
|
||||
Sorter();
|
||||
~Sorter() { Delete(); }
|
||||
|
||||
// Move-only
|
||||
Sorter(const Sorter&) = delete;
|
||||
Sorter& operator=(const Sorter&) = delete;
|
||||
Sorter(Sorter&& other) {
|
||||
Delete();
|
||||
ptr_ = other.ptr_;
|
||||
other.ptr_ = nullptr;
|
||||
}
|
||||
Sorter& operator=(Sorter&& other) {
|
||||
Delete();
|
||||
ptr_ = other.ptr_;
|
||||
other.ptr_ = nullptr;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Sorts keys[0, n). Dispatches to the best available instruction set,
|
||||
// and does not allocate memory.
|
||||
void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
|
||||
void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
|
||||
void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
|
||||
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
|
||||
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
|
||||
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortAscending) const;
|
||||
void operator()(K32V32* HWY_RESTRICT keys, size_t n, SortDescending) const;
|
||||
|
||||
// For internal use only
|
||||
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
|
||||
static bool HaveFloat64();
|
||||
|
||||
private:
|
||||
void Delete();
|
||||
|
||||
template <typename T>
|
||||
T* Get() const {
|
||||
return static_cast<T*>(ptr_);
|
||||
}
|
||||
|
||||
void* ptr_ = nullptr;
|
||||
};
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
|
||||
@@ -0,0 +1,62 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Sorts `num` u64 lanes, treated as 128-bit keys, using the ascending 128-bit
// order traits; `buf` is scratch space. Asserts if VQSORT_ENABLED is 0.
void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
#if !VQSORT_ENABLED
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Traits =
      detail::SharedTraits<detail::Traits128<detail::OrderAscending128>>;
  SortTag<uint64_t> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(Sort128Asc);
|
||||
} // namespace
|
||||
|
||||
// Ascending sort of n 128-bit keys. Each key is handed to the per-target
// implementation as two u64 lanes, hence the lane count of n * 2.
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = n * 2;
  HWY_DYNAMIC_DISPATCH(Sort128Asc)(lanes, num_lanes, Get<uint64_t>());
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,62 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Sorts `num` u64 lanes, treated as 128-bit keys, using the descending
// 128-bit order traits; `buf` is scratch space. Asserts if VQSORT_ENABLED is 0.
void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
#if !VQSORT_ENABLED
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Traits =
      detail::SharedTraits<detail::Traits128<detail::OrderDescending128>>;
  SortTag<uint64_t> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(Sort128Desc);
|
||||
} // namespace
|
||||
|
||||
// Descending sort of n 128-bit keys. Each key is handed to the per-target
// implementation as two u64 lanes, hence the lane count of n * 2.
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  uint64_t* const lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = n * 2;
  HWY_DYNAMIC_DISPATCH(Sort128Desc)(lanes, num_lanes, Get<uint64_t>());
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,53 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
|
||||
SortTag<float> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortF32Asc);
|
||||
} // namespace
|
||||
|
||||
// Ascending float sort: forwards to the dispatched SortF32Asc together with
// this Sorter's buffer.
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  float* const scratch = Get<float>();
  HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, scratch);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
|
||||
float* HWY_RESTRICT buf) {
|
||||
SortTag<float> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortF32Desc);
|
||||
} // namespace
|
||||
|
||||
// Descending float sort: forwards to the dispatched SortF32Desc together with
// this Sorter's buffer.
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  float* const scratch = Get<float>();
  HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, scratch);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,61 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Per-target entry point: sorts keys[0, num) with the ascending double
// traits, using `buf` as scratch space. Asserts on targets where
// HWY_HAVE_FLOAT64 is 0.
void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
                double* HWY_RESTRICT buf) {
#if !HWY_HAVE_FLOAT64
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Order = detail::OrderAscending<double>;
  using Traits = detail::SharedTraits<detail::TraitsLane<Order>>;
  SortTag<double> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortF64Asc);
|
||||
} // namespace
|
||||
|
||||
// Ascending double sort: forwards to the dispatched SortF64Asc together with
// this Sorter's buffer.
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  double* const scratch = Get<double>();
  HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, scratch);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,61 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Per-target entry point: sorts keys[0, num) with the descending double
// traits, using `buf` as scratch space. Asserts on targets where
// HWY_HAVE_FLOAT64 is 0.
void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
                 double* HWY_RESTRICT buf) {
#if !HWY_HAVE_FLOAT64
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#else
  using Order = detail::OrderDescending<double>;
  using Traits = detail::SharedTraits<detail::TraitsLane<Order>>;
  SortTag<double> tag;
  Traits traits;
  Sort(tag, traits, keys, num, buf);
#endif
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortF64Desc);
|
||||
} // namespace
|
||||
|
||||
// Descending double sort: forwards to the dispatched SortF64Desc together
// with this Sorter's buffer.
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  double* const scratch = Get<double>();
  HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, scratch);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
|
||||
int16_t* HWY_RESTRICT buf) {
|
||||
SortTag<int16_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortI16Asc);
|
||||
} // namespace
|
||||
|
||||
// Ascending int16_t sort: forwards to the dispatched SortI16Asc together with
// this Sorter's buffer.
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  int16_t* const scratch = Get<int16_t>();
  HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, scratch);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
|
||||
int16_t* HWY_RESTRICT buf) {
|
||||
SortTag<int16_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortI16Desc);
|
||||
} // namespace
|
||||
|
||||
// Descending int16_t sort: forwards to the dispatched SortI16Desc together
// with this Sorter's buffer.
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  int16_t* const scratch = Get<int16_t>();
  HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, scratch);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
|
||||
int32_t* HWY_RESTRICT buf) {
|
||||
SortTag<int32_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortI32Asc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
|
||||
SortAscending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
|
||||
int32_t* HWY_RESTRICT buf) {
|
||||
SortTag<int32_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortI32Desc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
|
||||
SortDescending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
|
||||
int64_t* HWY_RESTRICT buf) {
|
||||
SortTag<int64_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortI64Asc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
|
||||
SortAscending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
|
||||
int64_t* HWY_RESTRICT buf) {
|
||||
SortTag<int64_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortI64Desc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
|
||||
SortDescending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,65 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
// clang-format off
|
||||
// (avoid line break, which would prevent Copybara rules from matching)
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc" //NOLINT
|
||||
// clang-format on
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
|
||||
uint64_t* HWY_RESTRICT buf) {
|
||||
#if VQSORT_ENABLED
|
||||
SortTag<uint64_t> d;
|
||||
detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
#else
|
||||
(void) keys;
|
||||
(void) num;
|
||||
(void) buf;
|
||||
HWY_ASSERT(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortKV128Asc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
|
||||
SortAscending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortKV128Asc)
|
||||
(reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,65 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
// clang-format off
|
||||
// (avoid line break, which would prevent Copybara rules from matching)
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc" //NOLINT
|
||||
// clang-format on
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
|
||||
uint64_t* HWY_RESTRICT buf) {
|
||||
#if VQSORT_ENABLED
|
||||
SortTag<uint64_t> d;
|
||||
detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
#else
|
||||
(void) keys;
|
||||
(void) num;
|
||||
(void) buf;
|
||||
HWY_ASSERT(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortKV128Desc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
|
||||
SortDescending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortKV128Desc)
|
||||
(reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,65 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
// clang-format off
|
||||
// (avoid line break, which would prevent Copybara rules from matching)
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64a.cc" //NOLINT
|
||||
// clang-format on
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortKV64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
|
||||
uint64_t* HWY_RESTRICT buf) {
|
||||
#if VQSORT_ENABLED
|
||||
SortTag<uint64_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscendingKV64>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
#else
|
||||
(void) keys;
|
||||
(void) num;
|
||||
(void) buf;
|
||||
HWY_ASSERT(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortKV64Asc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
|
||||
SortAscending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortKV64Asc)
|
||||
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,65 @@
|
||||
// Copyright 2022 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
// clang-format off
|
||||
// (avoid line break, which would prevent Copybara rules from matching)
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv64d.cc" //NOLINT
|
||||
// clang-format on
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortKV64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
|
||||
uint64_t* HWY_RESTRICT buf) {
|
||||
#if VQSORT_ENABLED
|
||||
SortTag<uint64_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescendingKV64>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
#else
|
||||
(void) keys;
|
||||
(void) num;
|
||||
(void) buf;
|
||||
HWY_ASSERT(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortKV64Desc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(K32V32* HWY_RESTRICT keys, size_t n,
|
||||
SortDescending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortKV64Desc)
|
||||
(reinterpret_cast<uint64_t*>(keys), n, Get<uint64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
|
||||
uint16_t* HWY_RESTRICT buf) {
|
||||
SortTag<uint16_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortU16Asc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
|
||||
SortAscending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,55 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
|
||||
uint16_t* HWY_RESTRICT buf) {
|
||||
SortTag<uint16_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
|
||||
st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortU16Desc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
|
||||
SortDescending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
|
||||
uint32_t* HWY_RESTRICT buf) {
|
||||
SortTag<uint32_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortU32Asc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
|
||||
SortAscending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,55 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
|
||||
uint32_t* HWY_RESTRICT buf) {
|
||||
SortTag<uint32_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
|
||||
st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortU32Desc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
|
||||
SortDescending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,54 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
|
||||
uint64_t* HWY_RESTRICT buf) {
|
||||
SortTag<uint64_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortU64Asc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
|
||||
SortAscending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,55 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
|
||||
uint64_t* HWY_RESTRICT buf) {
|
||||
SortTag<uint64_t> d;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
|
||||
st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(SortU64Desc);
|
||||
} // namespace
|
||||
|
||||
void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
|
||||
SortDescending) const {
|
||||
HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>());
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,234 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
|
||||
#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
|
||||
|
||||
// Detects compiler and arch from predefined macros. Zero dependencies for
|
||||
// inclusion by foreach_target.h.
|
||||
|
||||
// HWY_IDE is 1 if any of the IDE-parser macros tested below is defined, and 0
// otherwise. OR it into #if conditions so the IDE does not gray out code.
#if !defined(__CDT_PARSER__) && !defined(__INTELLISENSE__) && \
    !defined(Q_CREATOR_RUN) && !defined(__CLANGD__)
#define HWY_IDE 0
#else
#define HWY_IDE 1
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Compiler
|
||||
|
||||
// Nonzero (= _MSC_VER) only for the real MSVC compiler. clang-cl also defines
// _MSC_VER but does not behave like MSVC in other respects (e.g.
// HWY_DIAGNOSTICS), so it is excluded here.
#if !defined(_MSC_VER) || defined(__clang__)
#define HWY_COMPILER_MSVC 0
#else
#define HWY_COMPILER_MSVC _MSC_VER
#endif
|
||||
|
||||
// Nonzero (= _MSC_VER) only when both _MSC_VER and __clang__ are defined.
#if !defined(_MSC_VER) || !defined(__clang__)
#define HWY_COMPILER_CLANGCL 0
#else
#define HWY_COMPILER_CLANGCL _MSC_VER
#endif
|
||||
|
||||
// Equals __INTEL_COMPILER when that macro is defined, otherwise 0.
#ifndef __INTEL_COMPILER
#define HWY_COMPILER_ICC 0
#else
#define HWY_COMPILER_ICC __INTEL_COMPILER
#endif
|
||||
|
||||
// Equals __INTEL_LLVM_COMPILER when that macro is defined, otherwise 0.
#ifndef __INTEL_LLVM_COMPILER
#define HWY_COMPILER_ICX 0
#else
#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
#endif
|
||||
|
||||
// HWY_COMPILER_GCC is generic: it is nonzero for every compiler implementing
// the GNU extensions (e.g. Clang, Intel...), not just GCC itself. The value is
// major * 100 + minor of the reported GNU C version.
#ifndef __GNUC__
#define HWY_COMPILER_GCC 0
#else
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#endif
|
||||
|
||||
// HWY_COMPILER_CLANG: nonzero for Clang or clang-cl, zero for GCC and others.
#ifndef __clang__
#define HWY_COMPILER_CLANG 0
#elif !defined(__apple_build_version__) && __clang_major__ < 999
// Ordinary Clang: the reported version can be used directly.
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#else
// Apple LLVM (whose version number is unrelated to that of LLVM) or an invalid
// version number: deduce the version from which warnings are available.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
#define HWY_COMPILER_CLANG 1300
#elif __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
    __has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently
// based on Clang 7, but does not support the warning we test.
// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and
// https://trac.macports.org/wiki/XcodeVersionInfo.
#elif __has_warning("-Wc++98-compat-extra-semi") || \
    (defined(__apple_build_version__) && __apple_build_version__ >= 10010000)
#define HWY_COMPILER_CLANG 700
#else  // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif  // __has_warning chain
#endif  // __clang__
|
||||
|
||||
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
|
||||
#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
|
||||
#else
|
||||
#define HWY_COMPILER_GCC_ACTUAL 0
|
||||
#endif
|
||||
|
||||
// More than one may be nonzero, but we want at least one.
|
||||
#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
|
||||
HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
|
||||
#error "Unsupported compiler"
|
||||
#endif
|
||||
|
||||
// We should only detect one of these (only clang/clangcl overlap)
|
||||
#if 1 < \
|
||||
(!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
|
||||
!!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
|
||||
#error "Detected multiple compilers"
|
||||
#endif
|
||||
|
||||
#ifdef __has_builtin
|
||||
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
|
||||
#else
|
||||
#define HWY_HAS_BUILTIN(name) 0
|
||||
#endif
|
||||
|
||||
#ifdef __has_attribute
|
||||
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
|
||||
#else
|
||||
#define HWY_HAS_ATTRIBUTE(name) 0
|
||||
#endif
|
||||
|
||||
#ifdef __has_feature
|
||||
#define HWY_HAS_FEATURE(name) __has_feature(name)
|
||||
#else
|
||||
#define HWY_HAS_FEATURE(name) 0
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Architecture
|
||||
|
||||
#if defined(__i386__) || defined(_M_IX86)
|
||||
#define HWY_ARCH_X86_32 1
|
||||
#else
|
||||
#define HWY_ARCH_X86_32 0
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define HWY_ARCH_X86_64 1
|
||||
#else
|
||||
#define HWY_ARCH_X86_64 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
|
||||
#error "Cannot have both x86-32 and x86-64"
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
|
||||
#define HWY_ARCH_X86 1
|
||||
#else
|
||||
#define HWY_ARCH_X86 0
|
||||
#endif
|
||||
|
||||
#if defined(__powerpc64__) || defined(_M_PPC)
|
||||
#define HWY_ARCH_PPC 1
|
||||
#else
|
||||
#define HWY_ARCH_PPC 0
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define HWY_ARCH_ARM_A64 1
|
||||
#else
|
||||
#define HWY_ARCH_ARM_A64 0
|
||||
#endif
|
||||
|
||||
#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
|
||||
#define HWY_ARCH_ARM_V7 1
|
||||
#else
|
||||
#define HWY_ARCH_ARM_V7 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
|
||||
#error "Cannot have both A64 and V7"
|
||||
#endif
|
||||
|
||||
// Any *supported* version of Arm, i.e. 7 or later
|
||||
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
|
||||
#define HWY_ARCH_ARM 1
|
||||
#else
|
||||
#define HWY_ARCH_ARM 0
|
||||
#endif
|
||||
|
||||
// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
|
||||
#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
|
||||
#define HWY_ARCH_ARM_OLD 1
|
||||
#else
|
||||
#define HWY_ARCH_ARM_OLD 0
|
||||
#endif
|
||||
|
||||
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
|
||||
#define HWY_ARCH_WASM 1
|
||||
#else
|
||||
#define HWY_ARCH_WASM 0
|
||||
#endif
|
||||
|
||||
#ifdef __riscv
|
||||
#define HWY_ARCH_RVV 1
|
||||
#else
|
||||
#define HWY_ARCH_RVV 0
|
||||
#endif
|
||||
|
||||
// It is an error to detect multiple architectures at the same time, but OK to
|
||||
// detect none of the above.
|
||||
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
|
||||
HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
|
||||
#error "Must not detect more than one architecture"
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define HWY_OS_WIN 1
|
||||
#else
|
||||
#define HWY_OS_WIN 0
|
||||
#endif
|
||||
|
||||
#if defined(linux) || defined(__linux__)
|
||||
#define HWY_OS_LINUX 1
|
||||
#else
|
||||
#define HWY_OS_LINUX 0
|
||||
#endif
|
||||
|
||||
#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
|
||||
@@ -0,0 +1,478 @@
|
||||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
|
||||
#define HIGHWAY_HWY_DETECT_TARGETS_H_
|
||||
|
||||
// Defines targets and chooses which to enable.
|
||||
|
||||
#include "hwy/detect_compiler_arch.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Optional configuration
|
||||
|
||||
// See g3doc/quick_reference.md for documentation of these macros.
|
||||
|
||||
// Uncomment to override the default baseline determined from predefined macros:
|
||||
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
|
||||
|
||||
// Uncomment to override the default blocklist:
|
||||
// #define HWY_BROKEN_TARGETS HWY_AVX3
|
||||
|
||||
// Uncomment to definitely avoid generating those target(s):
|
||||
// #define HWY_DISABLED_TARGETS HWY_SSE4
|
||||
|
||||
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
|
||||
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
|
||||
// #define HWY_DISABLE_BMI2_FMA
|
||||
|
||||
// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
|
||||
// #define HWY_WANT_SSSE3
|
||||
// #define HWY_WANT_SSE4
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Targets
|
||||
|
||||
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
|
||||
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
|
||||
//
|
||||
// All values are unconditionally defined so we can test HWY_TARGETS without
|
||||
// first checking the HWY_ARCH_*.
|
||||
//
|
||||
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
|
||||
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
|
||||
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
|
||||
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
|
||||
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
|
||||
|
||||
// --------------------------- x86: 15 targets (+ one fallback)
|
||||
// Bits 0..6 reserved (7 targets)
|
||||
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
|
||||
// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
|
||||
// Tiger Lake? We do not yet have uses for GFNI.
|
||||
#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below
|
||||
#define HWY_AVX3 (1LL << 8)
|
||||
#define HWY_AVX2 (1LL << 9)
|
||||
// Bit 10: reserved for AVX
|
||||
#define HWY_SSE4 (1LL << 11)
|
||||
#define HWY_SSSE3 (1LL << 12)
|
||||
// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
|
||||
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
|
||||
// dynamic dispatch. All x86 target bits must be lower or equal to
|
||||
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
|
||||
// HWY_MAX_DYNAMIC_TARGETS in total.
|
||||
#define HWY_HIGHEST_TARGET_BIT_X86 14
|
||||
|
||||
// --------------------------- Arm: 15 targets (+ one fallback)
|
||||
// Bits 15..23 reserved (9 targets)
|
||||
#define HWY_SVE2_128 (1LL << 24) // specialized target (e.g. Arm N2)
|
||||
#define HWY_SVE_256 (1LL << 25) // specialized target (e.g. Arm V1)
|
||||
#define HWY_SVE2 (1LL << 26)
|
||||
#define HWY_SVE (1LL << 27)
|
||||
#define HWY_NEON (1LL << 28) // On A64, includes/requires AES
|
||||
// Bit 29 reserved (Helium?)
|
||||
#define HWY_HIGHEST_TARGET_BIT_ARM 29
|
||||
|
||||
// --------------------------- RISC-V: 9 targets (+ one fallback)
|
||||
// Bits 30..36 reserved (7 targets)
|
||||
#define HWY_RVV (1LL << 37)
|
||||
// Bit 38 reserved
|
||||
#define HWY_HIGHEST_TARGET_BIT_RVV 38
|
||||
|
||||
// --------------------------- Future expansion: 4 targets
|
||||
// Bits 39..42 reserved
|
||||
|
||||
|
||||
// --------------------------- IBM Power: 9 targets (+ one fallback)
|
||||
// Bits 43..48 reserved (6 targets)
|
||||
#define HWY_PPC8 (1LL << 49) // v2.07 or 3
|
||||
// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
|
||||
#define HWY_HIGHEST_TARGET_BIT_PPC 51
|
||||
|
||||
// --------------------------- WebAssembly: 9 targets (+ one fallback)
|
||||
// Bits 52..57 reserved (6 targets)
|
||||
#define HWY_WASM_EMU256 (1LL << 58) // Experimental
|
||||
#define HWY_WASM (1LL << 59)
|
||||
// Bit 60 reserved
|
||||
#define HWY_HIGHEST_TARGET_BIT_WASM 60
|
||||
|
||||
// --------------------------- Emulation: 2 targets
|
||||
|
||||
#define HWY_EMU128 (1LL << 61)
|
||||
// We do not add/left-shift, so this will not overflow to a negative number.
|
||||
#define HWY_SCALAR (1LL << 62)
|
||||
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
|
||||
|
||||
// Do not use bit 63 - would be confusing to have negative numbers.
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Set default blocklists
|
||||
|
||||
// Disabled means excluded from enabled at user's request. A separate config
|
||||
// macro allows disabling without deactivating the blocklist below.
|
||||
#ifndef HWY_DISABLED_TARGETS
|
||||
#define HWY_DISABLED_TARGETS 0
|
||||
#endif
|
||||
|
||||
// Broken means excluded from enabled due to known compiler issues. Allow the
|
||||
// user to override this blocklist without any guarantee of success.
|
||||
#ifndef HWY_BROKEN_TARGETS
|
||||
|
||||
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
|
||||
// SSE4 codegen (possibly only for msan), so disable all those targets.
|
||||
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
|
||||
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
|
||||
// This entails a major speed reduction, so warn unless the user explicitly
|
||||
// opts in to scalar-only.
|
||||
#if !defined(HWY_COMPILE_ONLY_SCALAR)
|
||||
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
|
||||
#endif
|
||||
|
||||
// 32-bit may fail to compile AVX2/3.
|
||||
#elif HWY_ARCH_X86_32
|
||||
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
|
||||
|
||||
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
|
||||
#elif HWY_COMPILER_MSVC != 0
|
||||
#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
|
||||
|
||||
// armv7be has not been tested and is not yet supported.
|
||||
#elif HWY_ARCH_ARM_V7 && \
|
||||
(defined(__ARM_BIG_ENDIAN) || \
|
||||
(defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
|
||||
#define HWY_BROKEN_TARGETS (HWY_NEON)
|
||||
|
||||
// SVE[2] require recent clang or gcc versions.
|
||||
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
|
||||
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
|
||||
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
|
||||
|
||||
#else
|
||||
#define HWY_BROKEN_TARGETS 0
|
||||
#endif
|
||||
|
||||
#endif // HWY_BROKEN_TARGETS
|
||||
|
||||
// Enabled means not disabled nor blocklisted.
|
||||
#define HWY_ENABLED(targets) \
|
||||
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
|
||||
|
||||
// Opt-out for EMU128 (affected by a GCC bug on multiple arches, fixed in 12.3:
|
||||
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322). This is separate
|
||||
// from HWY_BROKEN_TARGETS because it affects the fallback target, which must
|
||||
// always be enabled. If 1, we instead choose HWY_SCALAR even without
|
||||
// HWY_COMPILE_ONLY_SCALAR being set.
|
||||
#if !defined(HWY_BROKEN_EMU128) // allow overriding
|
||||
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1203
|
||||
#define HWY_BROKEN_EMU128 1
|
||||
#else
|
||||
#define HWY_BROKEN_EMU128 0
|
||||
#endif
|
||||
#endif // HWY_BROKEN_EMU128
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Detect baseline targets using predefined macros
|
||||
|
||||
// Baseline means the targets for which the compiler is allowed to generate
|
||||
// instructions, implying the target CPU would have to support them. This does
|
||||
// not take the blocklist into account.
|
||||
|
||||
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
|
||||
#define HWY_BASELINE_SCALAR HWY_SCALAR
|
||||
#else
|
||||
#define HWY_BASELINE_SCALAR HWY_EMU128
|
||||
#endif
|
||||
|
||||
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
|
||||
// HWY_TARGET == HWY_BASELINE_SCALAR.
|
||||
|
||||
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
|
||||
#if defined(HWY_WANT_WASM2)
|
||||
#define HWY_BASELINE_WASM HWY_WASM_EMU256
|
||||
#else
|
||||
#define HWY_BASELINE_WASM HWY_WASM
|
||||
#endif // HWY_WANT_WASM2
|
||||
#else
|
||||
#define HWY_BASELINE_WASM 0
|
||||
#endif
|
||||
|
||||
// Avoid choosing the PPC target until we have an implementation.
|
||||
#if HWY_ARCH_PPC && defined(__VSX__) && 0
|
||||
#define HWY_BASELINE_PPC8 HWY_PPC8
|
||||
#else
|
||||
#define HWY_BASELINE_PPC8 0
|
||||
#endif
|
||||
|
||||
#define HWY_BASELINE_SVE2 0
|
||||
#define HWY_BASELINE_SVE 0
|
||||
#define HWY_BASELINE_NEON 0
|
||||
|
||||
#if HWY_ARCH_ARM
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE2)
|
||||
#undef HWY_BASELINE_SVE2 // was 0, will be re-defined
|
||||
// If user specified -msve-vector-bits=128, they assert the vector length is
|
||||
// 128 bits and we should use the HWY_SVE2_128 (more efficient for some ops).
|
||||
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 128
|
||||
#define HWY_BASELINE_SVE2 HWY_SVE2_128
|
||||
// Otherwise we're not sure what the vector length will be. The baseline must be
|
||||
// unconditionally valid, so we can only assume HWY_SVE2. However, when running
|
||||
// on a CPU with 128-bit vectors, user code that supports dynamic dispatch will
|
||||
// still benefit from HWY_SVE2_128 because we add it to HWY_ATTAINABLE_TARGETS.
|
||||
#else
|
||||
#define HWY_BASELINE_SVE2 HWY_SVE2
|
||||
#endif // __ARM_FEATURE_SVE_BITS
|
||||
#endif // __ARM_FEATURE_SVE2
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#undef HWY_BASELINE_SVE // was 0, will be re-defined
|
||||
// See above. If user-specified vector length matches our optimization, use it.
|
||||
#if defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS == 256
|
||||
#define HWY_BASELINE_SVE HWY_SVE_256
|
||||
#else
|
||||
#define HWY_BASELINE_SVE HWY_SVE
|
||||
#endif // __ARM_FEATURE_SVE_BITS
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#undef HWY_BASELINE_NEON
|
||||
#define HWY_BASELINE_NEON HWY_NEON
|
||||
#endif
|
||||
|
||||
#endif // HWY_ARCH_ARM
|
||||
|
||||
// Special handling for MSVC because it has fewer predefined macros:
|
||||
#if HWY_COMPILER_MSVC
|
||||
|
||||
// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
|
||||
// https://stackoverflow.com/questions/18563978/.
|
||||
#if defined(__AVX__)
|
||||
#define HWY_CHECK_SSSE3 1
|
||||
#define HWY_CHECK_SSE4 1
|
||||
#else
|
||||
#define HWY_CHECK_SSSE3 0
|
||||
#define HWY_CHECK_SSE4 0
|
||||
#endif
|
||||
|
||||
// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
|
||||
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
|
||||
#define HWY_CHECK_PCLMUL_AES 1
|
||||
#define HWY_CHECK_BMI2_FMA 1
|
||||
#define HWY_CHECK_F16C 1
|
||||
|
||||
#else // non-MSVC
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
#define HWY_CHECK_SSSE3 1
|
||||
#else
|
||||
#define HWY_CHECK_SSSE3 0
|
||||
#endif
|
||||
|
||||
#if defined(__SSE4_1__) && defined(__SSE4_2__)
|
||||
#define HWY_CHECK_SSE4 1
|
||||
#else
|
||||
#define HWY_CHECK_SSE4 0
|
||||
#endif
|
||||
|
||||
// If these are disabled, they should not gate the availability of SSE4/AVX2.
|
||||
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
|
||||
#define HWY_CHECK_PCLMUL_AES 1
|
||||
#else
|
||||
#define HWY_CHECK_PCLMUL_AES 0
|
||||
#endif
|
||||
|
||||
#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
|
||||
#define HWY_CHECK_BMI2_FMA 1
|
||||
#else
|
||||
#define HWY_CHECK_BMI2_FMA 0
|
||||
#endif
|
||||
|
||||
#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
|
||||
#define HWY_CHECK_F16C 1
|
||||
#else
|
||||
#define HWY_CHECK_F16C 0
|
||||
#endif
|
||||
|
||||
#endif // non-MSVC
|
||||
|
||||
// HWY_WANT_SSSE3 / HWY_WANT_SSE4 are opt-in flags that may be defined without
// a value. Test them with defined(): expanding an empty macro inside the #if
// expression would otherwise be a syntax error.
#if HWY_ARCH_X86 && (defined(HWY_WANT_SSSE3) || HWY_CHECK_SSSE3)
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif

// SSE4 additionally requires PCLMUL/AES unless the user explicitly opts in.
#if HWY_ARCH_X86 && \
    (defined(HWY_WANT_SSE4) || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif
|
||||
|
||||
#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
|
||||
defined(__AVX2__)
|
||||
#define HWY_BASELINE_AVX2 HWY_AVX2
|
||||
#else
|
||||
#define HWY_BASELINE_AVX2 0
|
||||
#endif
|
||||
|
||||
// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
|
||||
#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
|
||||
defined(__AVX512DQ__) && defined(__AVX512VL__)
|
||||
#define HWY_BASELINE_AVX3 HWY_AVX3
|
||||
#else
|
||||
#define HWY_BASELINE_AVX3 0
|
||||
#endif
|
||||
|
||||
// TODO(janwas): not yet known whether these will be set by MSVC
|
||||
#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
|
||||
defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
|
||||
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
|
||||
defined(__AVX512BITALG__)
|
||||
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
|
||||
#else
|
||||
#define HWY_BASELINE_AVX3_DL 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_RVV && defined(__riscv_vector)
|
||||
#define HWY_BASELINE_RVV HWY_RVV
|
||||
#else
|
||||
#define HWY_BASELINE_RVV 0
|
||||
#endif
|
||||
|
||||
// Allow the user to override this without any guarantee of success.
|
||||
#ifndef HWY_BASELINE_TARGETS
|
||||
#define HWY_BASELINE_TARGETS \
|
||||
(HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
|
||||
HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON | \
|
||||
HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
|
||||
HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
|
||||
#endif // HWY_BASELINE_TARGETS
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Choose target for static dispatch
|
||||
|
||||
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
|
||||
#if HWY_ENABLED_BASELINE == 0
|
||||
#error "At least one baseline target must be defined and enabled"
|
||||
#endif
|
||||
|
||||
// Best baseline, used for static dispatch. This is the least-significant 1-bit
|
||||
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
|
||||
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
|
||||
|
||||
// Start by assuming static dispatch. If we later use dynamic dispatch, this
|
||||
// will be defined to other targets during the multiple-inclusion, and finally
|
||||
// return to the initial value. Defining this outside begin/end_target ensures
|
||||
// inl headers successfully compile by themselves (required by Bazel).
|
||||
#define HWY_TARGET HWY_STATIC_TARGET
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Choose targets for dynamic dispatch according to one of four policies
|
||||
|
||||
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
|
||||
defined(HWY_COMPILE_ONLY_STATIC))
|
||||
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
|
||||
#endif
|
||||
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
|
||||
|
||||
// Clang, GCC and MSVC allow runtime dispatch on x86.
|
||||
#if HWY_ARCH_X86
|
||||
#define HWY_HAVE_RUNTIME_DISPATCH 1
|
||||
// On Arm, currently only GCC does, and we require Linux to detect CPU
|
||||
// capabilities.
|
||||
#elif HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX
|
||||
#define HWY_HAVE_RUNTIME_DISPATCH 1
|
||||
#else
|
||||
#define HWY_HAVE_RUNTIME_DISPATCH 0
|
||||
#endif
|
||||
|
||||
// AVX3_DL is not widely available yet. To reduce code size and compile time,
|
||||
// only include it in the set of attainable targets (for dynamic dispatch) if
|
||||
// the user opts in, OR it is in the baseline (we check whether enabled below).
|
||||
// Test HWY_BASELINE_TARGETS, the baseline mask (not yet filtered by the
// blocklist). An undefined identifier would silently evaluate to 0 inside #if
// and drop AVX3_DL from the attainable set even when it is the baseline.
#if defined(HWY_WANT_AVX3_DL) || ((HWY_BASELINE_TARGETS) & HWY_AVX3_DL)
#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_ATTAINABLE_AVX3_DL 0
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
|
||||
(HWY_ENABLED_BASELINE & (HWY_SVE | HWY_SVE_256)))
|
||||
#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_SVE 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64 && (HWY_HAVE_RUNTIME_DISPATCH || \
|
||||
(HWY_ENABLED_BASELINE & (HWY_SVE2 | HWY_SVE2_128)))
|
||||
#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_SVE2 0
|
||||
#endif
|
||||
|
||||
// Attainable means enabled and the compiler allows intrinsics (even when not
|
||||
// allowed to autovectorize). Used in 3 and 4.
|
||||
#if HWY_ARCH_X86
|
||||
#define HWY_ATTAINABLE_TARGETS \
|
||||
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
|
||||
HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
|
||||
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
|
||||
#define HWY_ATTAINABLE_TARGETS \
|
||||
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
|
||||
HWY_ATTAINABLE_SVE2)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_TARGETS \
|
||||
(HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
|
||||
#endif
|
||||
|
||||
// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
|
||||
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
|
||||
#undef HWY_STATIC_TARGET
|
||||
#define HWY_STATIC_TARGET HWY_EMU128 // override baseline
|
||||
#define HWY_TARGETS HWY_EMU128
|
||||
|
||||
// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
|
||||
// we currently still support it for backwards compatibility.
|
||||
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
|
||||
(defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
|
||||
#undef HWY_STATIC_TARGET
|
||||
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
|
||||
#define HWY_TARGETS HWY_SCALAR
|
||||
|
||||
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
|
||||
#elif defined(HWY_COMPILE_ONLY_STATIC)
|
||||
#define HWY_TARGETS HWY_STATIC_TARGET
|
||||
|
||||
// 3) For tests: include all attainable targets (in particular: scalar)
|
||||
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
|
||||
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
|
||||
|
||||
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
|
||||
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
|
||||
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
|
||||
// sets all lower bits (better targets), then we also include the static target.
|
||||
#else
|
||||
#define HWY_TARGETS \
|
||||
(HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
|
||||
|
||||
#endif // target policy
|
||||
|
||||
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
|
||||
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
|
||||
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
|
||||
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
|
||||
#error "Logic error: best baseline should be included in dynamic targets"
|
||||
#endif
|
||||
|
||||
#endif // HIGHWAY_HWY_DETECT_TARGETS_H_
|
||||
@@ -0,0 +1,254 @@
|
||||
// Copyright 2019 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __STDC_FORMAT_MACROS
|
||||
#define __STDC_FORMAT_MACROS // before inttypes.h
|
||||
#endif
|
||||
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>  // abort

#include <cmath>  // std::abs
#include <memory>
#include <numeric>  // iota, inner_product
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// Must come after foreach_target.h to avoid redefinition errors.
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/nanobenchmark.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// These templates are not found via ADL.
|
||||
#if HWY_TARGET != HWY_SCALAR
|
||||
using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
|
||||
#endif
|
||||
|
||||
class TwoArray {
|
||||
public:
|
||||
// Must be a multiple of the vector lane count * 8.
|
||||
static size_t NumItems() { return 3456; }
|
||||
|
||||
TwoArray()
|
||||
: a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
|
||||
// = 1, but compiler doesn't know
|
||||
const float init = static_cast<float>(Unpredictable1());
|
||||
std::iota(a_.get(), a_.get() + NumItems(), init);
|
||||
std::iota(b_, b_ + NumItems(), init);
|
||||
}
|
||||
|
||||
protected:
|
||||
AlignedFreeUniquePtr<float[]> a_;
|
||||
float* b_;
|
||||
};
|
||||
|
||||
// Measures durations, verifies results, prints timings.
|
||||
template <class Benchmark>
|
||||
void RunBenchmark(const char* caption) {
|
||||
printf("%10s: ", caption);
|
||||
const size_t kNumInputs = 1;
|
||||
const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
|
||||
const FuncInput inputs[kNumInputs] = {num_items};
|
||||
Result results[kNumInputs];
|
||||
|
||||
Benchmark benchmark;
|
||||
|
||||
Params p;
|
||||
p.verbose = false;
|
||||
p.max_evals = 7;
|
||||
p.target_rel_mad = 0.002;
|
||||
const size_t num_results = MeasureClosure(
|
||||
[&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
|
||||
kNumInputs, results, p);
|
||||
if (num_results != kNumInputs) {
|
||||
fprintf(stderr, "MeasureClosure failed.\n");
|
||||
}
|
||||
|
||||
benchmark.Verify(num_items);
|
||||
|
||||
for (size_t i = 0; i < num_results; ++i) {
|
||||
const double cycles_per_item =
|
||||
results[i].ticks / static_cast<double>(results[i].input);
|
||||
const double mad = results[i].variability * cycles_per_item;
|
||||
printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
|
||||
static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
|
||||
}
|
||||
}
|
||||
|
||||
void Intro() {
|
||||
const float in[16] = {1, 2, 3, 4, 5, 6};
|
||||
float out[16];
|
||||
const ScalableTag<float> d; // largest possible vector
|
||||
for (size_t i = 0; i < 16; i += Lanes(d)) {
|
||||
const auto vec = LoadU(d, in + i); // no alignment requirement
|
||||
auto result = Mul(vec, vec);
|
||||
result = Add(result, result); // can update if not const
|
||||
StoreU(result, d, out + i);
|
||||
}
|
||||
printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
|
||||
}
|
||||
|
||||
// BEGINNER: dot product
|
||||
// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
|
||||
class BenchmarkDot : public TwoArray {
|
||||
public:
|
||||
BenchmarkDot() : dot_{-1.0f} {}
|
||||
|
||||
FuncOutput operator()(const size_t num_items) {
|
||||
const ScalableTag<float> d;
|
||||
const size_t N = Lanes(d);
|
||||
using V = decltype(Zero(d));
|
||||
// Compiler doesn't make independent sum* accumulators, so unroll manually.
|
||||
// We cannot use an array because V might be a sizeless type. For reasonable
|
||||
// code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
|
||||
V sum0 = Zero(d);
|
||||
V sum1 = Zero(d);
|
||||
V sum2 = Zero(d);
|
||||
V sum3 = Zero(d);
|
||||
const float* const HWY_RESTRICT pa = &a_[0];
|
||||
const float* const HWY_RESTRICT pb = b_;
|
||||
for (size_t i = 0; i < num_items; i += 4 * N) {
|
||||
const auto a0 = Load(d, pa + i + 0 * N);
|
||||
const auto b0 = Load(d, pb + i + 0 * N);
|
||||
sum0 = MulAdd(a0, b0, sum0);
|
||||
const auto a1 = Load(d, pa + i + 1 * N);
|
||||
const auto b1 = Load(d, pb + i + 1 * N);
|
||||
sum1 = MulAdd(a1, b1, sum1);
|
||||
const auto a2 = Load(d, pa + i + 2 * N);
|
||||
const auto b2 = Load(d, pb + i + 2 * N);
|
||||
sum2 = MulAdd(a2, b2, sum2);
|
||||
const auto a3 = Load(d, pa + i + 3 * N);
|
||||
const auto b3 = Load(d, pb + i + 3 * N);
|
||||
sum3 = MulAdd(a3, b3, sum3);
|
||||
}
|
||||
// Reduction tree: sum of all accumulators by pairs into sum0.
|
||||
sum0 = Add(sum0, sum1);
|
||||
sum2 = Add(sum2, sum3);
|
||||
sum0 = Add(sum0, sum2);
|
||||
dot_ = GetLane(SumOfLanes(d, sum0));
|
||||
return static_cast<FuncOutput>(dot_);
|
||||
}
|
||||
void Verify(size_t num_items) {
|
||||
if (dot_ == -1.0f) {
|
||||
fprintf(stderr, "Dot: must call Verify after benchmark");
|
||||
abort();
|
||||
}
|
||||
|
||||
const float expected =
|
||||
std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
|
||||
const float rel_err = std::abs(expected - dot_) / expected;
|
||||
if (rel_err > 1.1E-6f) {
|
||||
fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
|
||||
rel_err);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
float dot_; // for Verify
|
||||
};
|
||||
|
||||
// INTERMEDIATE: delta coding
|
||||
// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
|
||||
struct BenchmarkDelta : public TwoArray {
|
||||
FuncOutput operator()(const size_t num_items) const {
|
||||
#if HWY_TARGET == HWY_SCALAR
|
||||
b_[0] = a_[0];
|
||||
for (size_t i = 1; i < num_items; ++i) {
|
||||
b_[i] = a_[i] - a_[i - 1];
|
||||
}
|
||||
#elif HWY_CAP_GE256
|
||||
// Larger vectors are split into 128-bit blocks, easiest to use the
|
||||
// unaligned load support to shift between them.
|
||||
const ScalableTag<float> df;
|
||||
const size_t N = Lanes(df);
|
||||
size_t i;
|
||||
b_[0] = a_[0];
|
||||
for (i = 1; i < N; ++i) {
|
||||
b_[i] = a_[i] - a_[i - 1];
|
||||
}
|
||||
for (; i < num_items; i += N) {
|
||||
const auto a = Load(df, &a_[i]);
|
||||
const auto shifted = LoadU(df, &a_[i - 1]);
|
||||
Store(a - shifted, df, &b_[i]);
|
||||
}
|
||||
#else // 128-bit
|
||||
// Slightly better than unaligned loads
|
||||
const HWY_CAPPED(float, 4) df;
|
||||
const size_t N = Lanes(df);
|
||||
size_t i;
|
||||
b_[0] = a_[0];
|
||||
for (i = 1; i < N; ++i) {
|
||||
b_[i] = a_[i] - a_[i - 1];
|
||||
}
|
||||
auto prev = Load(df, &a_[0]);
|
||||
for (; i < num_items; i += Lanes(df)) {
|
||||
const auto a = Load(df, &a_[i]);
|
||||
const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
|
||||
prev = a;
|
||||
Store(Sub(a, shifted), df, &b_[i]);
|
||||
}
|
||||
#endif
|
||||
return static_cast<FuncOutput>(b_[num_items - 1]);
|
||||
}
|
||||
|
||||
void Verify(size_t num_items) {
|
||||
for (size_t i = 0; i < num_items; ++i) {
|
||||
const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
|
||||
const float err = std::abs(expected - b_[i]);
|
||||
if (err > 1E-6f) {
|
||||
fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void RunBenchmarks() {
|
||||
Intro();
|
||||
printf("------------------------ %s\n", TargetName(HWY_TARGET));
|
||||
RunBenchmark<BenchmarkDot>("dot");
|
||||
RunBenchmark<BenchmarkDelta>("delta");
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_EXPORT(RunBenchmarks);
|
||||
|
||||
void Run() {
|
||||
for (int64_t target : SupportedAndGeneratedTargets()) {
|
||||
SetSupportedTargetsForTest(target);
|
||||
HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
|
||||
}
|
||||
SetSupportedTargetsForTest(0); // Reset the mask afterwards.
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
// Program entry point: command-line arguments are ignored; runs the
// benchmarks and reports success.
int main(int, char**) {
  hwy::Run();
  return 0;
}
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,66 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Demo of functions that might be called from multiple SIMD modules (either
|
||||
// other -inl.h files, or a .cc file between begin/end_target-inl). This is
|
||||
// optional - all SIMD code can reside in .cc files. However, this allows
|
||||
// splitting code into different files while still inlining instead of requiring
|
||||
// calling through function pointers.
|
||||
|
||||
// Per-target include guard. This is only required when using dynamic dispatch,
|
||||
// i.e. including foreach_target.h. For static dispatch, a normal include
|
||||
// guard would be fine because the header is only compiled once.
|
||||
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#endif
|
||||
|
||||
// It is fine to #include normal or *-inl headers.
|
||||
#include <stddef.h>
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace skeleton {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Highway ops reside here; ADL does not find templates nor builtins.
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
// Example of a type-agnostic (caller-specified lane type) and width-agnostic
|
||||
// (uses best available instruction set) function in a header.
|
||||
//
|
||||
// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
|
||||
template <class D, typename T>
|
||||
HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
|
||||
const T* HWY_RESTRICT add_array,
|
||||
const size_t size, T* HWY_RESTRICT x_array) {
|
||||
for (size_t i = 0; i < size; i += hn::Lanes(d)) {
|
||||
const auto mul = hn::Load(d, mul_array + i);
|
||||
const auto add = hn::Load(d, add_array + i);
|
||||
auto x = hn::Load(d, x_array + i);
|
||||
x = hn::MulAdd(mul, x, add);
|
||||
hn::Store(x, d, x_array + i);
|
||||
}
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace skeleton
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // include guard
|
||||
@@ -0,0 +1,121 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/examples/skeleton.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
// >>>> for dynamic dispatch only, skip if you want static dispatch
|
||||
|
||||
// First undef to prevent error when re-included.
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
// For dynamic dispatch, specify the name of the current file (unfortunately
|
||||
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
|
||||
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
|
||||
// Generates code for each enabled target by re-including this source file.
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// <<<< end of dynamic dispatch
|
||||
|
||||
// Must come after foreach_target.h to avoid redefinition errors.
|
||||
#include "hwy/highway.h"
|
||||
|
||||
// Optional, can instead add HWY_ATTR to all functions.
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
|
||||
namespace skeleton {
|
||||
// This namespace name is unique per target, which allows code for multiple
|
||||
// targets to co-exist in the same translation unit. Required when using dynamic
|
||||
// dispatch, otherwise optional.
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Highway ops reside here; ADL does not find templates nor builtins.
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
// Computes log2 by converting to a vector of floats. Compiled once per target.
|
||||
template <class DF>
|
||||
HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
|
||||
const uint8_t* HWY_RESTRICT values,
|
||||
uint8_t* HWY_RESTRICT log2) {
|
||||
// Type tags for converting to other element types (Rebind = same count).
|
||||
const hn::RebindToSigned<DF> d32;
|
||||
const hn::Rebind<uint8_t, DF> d8;
|
||||
|
||||
const auto u8 = hn::Load(d8, values);
|
||||
const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
|
||||
const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
|
||||
hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
|
||||
}
|
||||
|
||||
// Prints the current target's name together with whether it reports 64-bit
// integer support.
void CodepathDemo() {
  // Highway defaults to portability, but per-target codepaths may be selected
  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
#if HWY_HAVE_INTEGER64
  const char* const int64_support = "Has int64";
#else
  const char* const int64_support = "No int64";
#endif
  const char* const target_name = hwy::TargetName(HWY_TARGET);
  printf("Target %s: %s\n", target_name, int64_support);
}
|
||||
|
||||
void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
|
||||
uint8_t* HWY_RESTRICT log2) {
|
||||
CodepathDemo();
|
||||
|
||||
const hn::ScalableTag<float> df;
|
||||
const size_t N = hn::Lanes(df);
|
||||
size_t i = 0;
|
||||
for (; i + N <= count; i += N) {
|
||||
OneFloorLog2(df, values + i, log2 + i);
|
||||
}
|
||||
for (; i < count; ++i) {
|
||||
hn::CappedTag<float, 1> d1;
|
||||
OneFloorLog2(d1, values + i, log2 + i);
|
||||
}
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace skeleton
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
// The table of pointers to the various implementations in HWY_NAMESPACE must
|
||||
// be compiled only once (foreach_target #includes this file multiple times).
|
||||
// HWY_ONCE is true for only one of these 'compilation passes'.
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace skeleton {
|
||||
|
||||
// This macro declares a static array used for dynamic dispatch; it resides in
|
||||
// the same outer namespace that contains FloorLog2.
|
||||
HWY_EXPORT(FloorLog2);
|
||||
|
||||
// This function is optional and only needed in the case of exposing it in the
|
||||
// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
|
||||
// is equivalent to inlining this function.
|
||||
// Public entry point: forwards to the FloorLog2 implementation chosen by
// dynamic dispatch.
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
                                 const size_t count,
                                 uint8_t* HWY_RESTRICT out) {
  // This must reside outside of HWY_NAMESPACE because it references (calls the
  // appropriate one from) the per-target implementations there.
  // For static dispatch, use HWY_STATIC_DISPATCH.
  HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}
|
||||
|
||||
// Optional: anything to compile only once, e.g. non-SIMD implementations of
|
||||
// public functions provided by this module, can go inside #if HWY_ONCE.
|
||||
|
||||
} // namespace skeleton
|
||||
#endif // HWY_ONCE
|
||||
@@ -0,0 +1,36 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Demo interface to target-specific code in skeleton.cc
|
||||
|
||||
// Normal header with include guard and namespace.
|
||||
#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
||||
#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
// Platform-specific definitions used for declaring an interface, independent of
|
||||
// the SIMD instruction set.
|
||||
#include "hwy/base.h" // HWY_RESTRICT
|
||||
|
||||
namespace skeleton {
|
||||
|
||||
// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
|
||||
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
|
||||
const size_t count, uint8_t* HWY_RESTRICT out);
|
||||
|
||||
} // namespace skeleton
|
||||
|
||||
#endif // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
||||
@@ -0,0 +1,110 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Example of unit test for the "skeleton" library.
|
||||
|
||||
#include "hwy/examples/skeleton.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// Must come after foreach_target.h to avoid redefinition errors.
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
// Optional: factor out parts of the implementation into *-inl.h
|
||||
// (must also come after foreach_target.h to avoid redefinition errors)
|
||||
#include "hwy/examples/skeleton-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace skeleton {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
namespace hn = hwy::HWY_NAMESPACE;
|
||||
|
||||
// Calls function defined in skeleton.cc.
|
||||
struct TestFloorLog2 {
|
||||
template <class T, class DF>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, DF df) {
|
||||
const size_t count = 5 * hn::Lanes(df);
|
||||
auto in = hwy::AllocateAligned<uint8_t>(count);
|
||||
auto expected = hwy::AllocateAligned<uint8_t>(count);
|
||||
|
||||
hwy::RandomState rng;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
expected[i] = Random32(&rng) & 7;
|
||||
in[i] = static_cast<uint8_t>(1u << expected[i]);
|
||||
}
|
||||
auto out = hwy::AllocateAligned<uint8_t>(count);
|
||||
CallFloorLog2(in.get(), count, out.get());
|
||||
int sum = 0;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
HWY_ASSERT_EQ(expected[i], out[i]);
|
||||
sum += out[i];
|
||||
}
|
||||
hwy::PreventElision(sum);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestFloorLog2 via ForPartialVectors with float lanes.
HWY_NOINLINE void TestAllFloorLog2() {
  hn::ForPartialVectors<TestFloorLog2> for_each_vector;
  for_each_vector(float());
}
|
||||
|
||||
// Calls function defined in skeleton-inl.h.
|
||||
struct TestSumMulAdd {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
hwy::RandomState rng;
|
||||
const size_t count = 4096;
|
||||
EXPECT_EQ(0, count % hn::Lanes(d));
|
||||
auto mul = hwy::AllocateAligned<T>(count);
|
||||
auto x = hwy::AllocateAligned<T>(count);
|
||||
auto add = hwy::AllocateAligned<T>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
mul[i] = static_cast<T>(Random32(&rng) & 0xF);
|
||||
x[i] = static_cast<T>(Random32(&rng) & 0xFF);
|
||||
add[i] = static_cast<T>(Random32(&rng) & 0xFF);
|
||||
}
|
||||
double expected_sum = 0.0;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
expected_sum += mul[i] * x[i] + add[i];
|
||||
}
|
||||
|
||||
MulAddLoop(d, mul.get(), add.get(), count, x.get());
|
||||
HWY_ASSERT_EQ(4344240.0, expected_sum);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestSumMulAdd via ForFloatTypes wrapped around ForPartialVectors.
HWY_NOINLINE void TestAllSumMulAdd() {
  const auto per_vector_test = hn::ForPartialVectors<TestSumMulAdd>();
  hn::ForFloatTypes(per_vector_test);
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace skeleton
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace skeleton {
|
||||
HWY_BEFORE_TEST(SkeletonTest);
|
||||
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
|
||||
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
|
||||
} // namespace skeleton
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,261 @@
|
||||
// Copyright 2020 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
|
||||
#define HIGHWAY_HWY_FOREACH_TARGET_H_
|
||||
|
||||
// Re-includes the translation unit zero or more times to compile for any
|
||||
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
|
||||
// highway.h defines the corresponding macro/namespace.
|
||||
|
||||
#include "hwy/detect_targets.h"
|
||||
|
||||
// *-inl.h may include other headers, which requires include guards to prevent
|
||||
// repeated inclusion. The guards must be reset after compiling each target, so
|
||||
// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
|
||||
// defining it if undefined and vice versa. This macro is initially undefined
|
||||
// so that IDEs don't gray out the contents of each header.
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#error "This macro must not be defined outside foreach_target.h"
|
||||
#endif
|
||||
|
||||
#ifdef HWY_HIGHWAY_INCLUDED // highway.h include guard
|
||||
// Trigger fixup at the bottom of this header.
|
||||
#define HWY_ALREADY_INCLUDED
|
||||
|
||||
// The next highway.h must re-include set_macros-inl.h because the first
|
||||
// highway.h chose the static target instead of what we will set below.
|
||||
#undef HWY_SET_MACROS_PER_TARGET
|
||||
#endif
|
||||
|
||||
// Disable HWY_EXPORT in user code until we have generated all targets. Note
|
||||
// that a subsequent highway.h will not override this definition.
|
||||
#undef HWY_ONCE
|
||||
#define HWY_ONCE (0 || HWY_IDE)
|
||||
|
||||
// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
|
||||
// also skip if only 1 target defined (no re-inclusion will be necessary).
|
||||
#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
|
||||
|
||||
#if !defined(HWY_TARGET_INCLUDE)
|
||||
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_EMU128
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SCALAR
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_NEON
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_RVV
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE2
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE_256
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SVE2_128
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SSSE3
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SSE4
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX2
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX3
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX3_DL
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_WASM_EMU256
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_WASM
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_PPC8
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
|
||||
|
||||
// Now that all but the static target have been generated, re-enable HWY_EXPORT.
|
||||
#undef HWY_ONCE
|
||||
#define HWY_ONCE 1
|
||||
|
||||
// If we re-include once per enabled target, the translation unit's
|
||||
// implementation would have to be skipped via #if to avoid redefining symbols.
|
||||
// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
|
||||
// implementation when resuming compilation of the translation unit.
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_STATIC_TARGET
|
||||
|
||||
#ifdef HWY_ALREADY_INCLUDED
|
||||
// Revert the previous toggle to prevent redefinitions for the static target.
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
|
||||
// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
|
||||
#ifdef HWY_SET_MACROS_PER_TARGET
|
||||
#undef HWY_SET_MACROS_PER_TARGET
|
||||
#else
|
||||
#define HWY_SET_MACROS_PER_TARGET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // HIGHWAY_HWY_FOREACH_TARGET_H_
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user