diff --git a/media/libjpeg/ChangeLog.md b/media/libjpeg/ChangeLog.md
new file mode 100644
index 0000000000..e6700c3c27
--- /dev/null
+++ b/media/libjpeg/ChangeLog.md
@@ -0,0 +1,1839 @@
+2.1.3
+=====
+
+### Significant changes relative to 2.1.2
+
+1. Fixed a regression introduced by 2.0 beta1[7] whereby cjpeg compressed PGM
+input files into full-color JPEG images unless the `-grayscale` option was
+used.
+
+2. cjpeg now automatically compresses GIF and 8-bit BMP input files into
+grayscale JPEG images if the input files contain only shades of gray.
+
+3. The build system now enables the intrinsics implementation of the AArch64
+(Arm 64-bit) Neon SIMD extensions by default when using GCC 12 or later.
+
+4. Fixed a segfault that occurred while decompressing a 4:2:0 JPEG image using
+the merged (non-fancy) upsampling algorithms (that is, with
+`cinfo.do_fancy_upsampling` set to `FALSE`) along with `jpeg_crop_scanline()`.
+Specifically, the segfault occurred if the number of bytes remaining in the
+output buffer was less than the number of bytes required to represent one
+uncropped scanline of the output image.  For that reason, the issue could only
+be reproduced using the libjpeg API, not using djpeg.
+
+
+2.1.2
+=====
+
+### Significant changes relative to 2.1.1
+
+1. Fixed a regression introduced by 2.1 beta1[13] that caused the remaining
+GAS implementations of AArch64 (Arm 64-bit) Neon SIMD functions (which are used
+by default with GCC for performance reasons) to be placed in the `.rodata`
+section rather than in the `.text` section.  This caused the GNU linker to
+automatically place the `.rodata` section in an executable segment, which
+prevented libjpeg-turbo from working properly with other linkers and also
+represented a potential security risk.
+
+2. Fixed an issue whereby the `tjTransform()` function incorrectly computed the
+MCU block size for 4:4:4 JPEG images with non-unary sampling factors and thus
+unduly rejected some cropping regions, even though those regions aligned with
+8x8 MCU block boundaries.
+
+3. Fixed a regression introduced by 2.1 beta1[13] that caused the build system
+to enable the Arm Neon SIMD extensions when targetting Armv6 and other legacy
+architectures that do not support Neon instructions.
+
+4. libjpeg-turbo now performs run-time detection of AltiVec instructions on
+FreeBSD/PowerPC systems if AltiVec instructions are not enabled at compile
+time.  This allows both AltiVec-equipped and non-AltiVec-equipped CPUs to be
+supported using the same build of libjpeg-turbo.
+
+5. cjpeg now accepts a `-strict` argument similar to that of djpeg and
+jpegtran, which causes the compressor to abort if an LZW-compressed GIF input
+image contains incomplete or corrupt image data.
+
+
+2.1.1
+=====
+
+### Significant changes relative to 2.1.0
+
+1. Fixed a regression introduced in 2.1.0 that caused build failures with
+non-GCC-compatible compilers for Un*x/Arm platforms.
+
+2. Fixed a regression introduced by 2.1 beta1[13] that prevented the Arm 32-bit
+(AArch32) Neon SIMD extensions from building unless the C compiler flags
+included `-mfloat-abi=softfp` or `-mfloat-abi=hard`.
+
+3. Fixed an issue in the AArch32 Neon SIMD Huffman encoder whereby reliance on
+undefined C compiler behavior led to crashes ("SIGBUS: illegal alignment") on
+Android systems when running AArch32/Thumb builds of libjpeg-turbo built with
+recent versions of Clang.
+
+4. Added a command-line argument (`-copy icc`) to jpegtran that causes it to
+copy only the ICC profile markers from the source file and discard any other
+metadata.
+
+5. libjpeg-turbo should now build and run on CHERI-enabled architectures, which
+use capability pointers that are larger than the size of `size_t`.
+
+6. Fixed a regression (CVE-2021-37972) introduced by 2.1 beta1[5] that caused a
+segfault in the 64-bit SSE2 Huffman encoder when attempting to losslessly
+transform a specially-crafted malformed JPEG image.
+
+
+2.1.0
+=====
+
+### Significant changes relative to 2.1 beta1
+
+1. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress certain progressive JPEG images with one or more component planes of
+width 8 or less caused a buffer overrun.
+
+2. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress a specially-crafted malformed progressive JPEG image caused the
+block smoothing algorithm to read from uninitialized memory.
+
+3. Fixed an issue in the Arm Neon SIMD Huffman encoders that caused the
+encoders to generate incorrect results when using the Clang compiler with
+Visual Studio.
+
+4. Fixed a floating point exception (CVE-2021-20205) that occurred when
+attempting to compress a specially-crafted malformed GIF image with a specified
+image width of 0 using cjpeg.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 32 and non-zero
+successive approximation low bit positions would, under certain circumstances,
+result in an error ("Missing Huffman code table entry") and an invalid JPEG
+image.
+
+6. Introduced a new flag (`TJFLAG_LIMITSCANS` in the TurboJPEG C API and
+`TJ.FLAG_LIMIT_SCANS` in the TurboJPEG Java API) and a corresponding TJBench
+command-line argument (`-limitscans`) that causes the TurboJPEG decompression
+and transform functions/operations to return/throw an error if a progressive
+JPEG image contains an unreasonably large number of scans.  This allows
+applications that use the TurboJPEG API to guard against an exploit of the
+progressive JPEG format described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+7. The PPM reader now throws an error, rather than segfaulting (due to a buffer
+overrun) or generating incorrect pixels, if an application attempts to use the
+`tjLoadImage()` function to load a 16-bit binary PPM file (a binary PPM file
+with a maximum value greater than 255) into a grayscale image buffer or to load
+a 16-bit binary PGM file into an RGB image buffer.
+
+8. Fixed an issue in the PPM reader that caused incorrect pixels to be
+generated when using the `tjLoadImage()` function to load a 16-bit binary PPM
+file into an extended RGB image buffer.
+
+9. Fixed an issue whereby, if a JPEG buffer was automatically re-allocated by
+one of the TurboJPEG compression or transform functions and an error
+subsequently occurred during compression or transformation, the JPEG buffer
+pointer passed by the application was not updated when the function returned.
+
+
+2.0.90 (2.1 beta1)
+==================
+
+### Significant changes relative to 2.0.6:
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers.  The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+     Caveats:
+     - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+     - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms.  Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+     - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+     - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+     - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+     - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+     - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+    The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (Armv7 or Armv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds.  32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. 32-bit (i386) OS X/macOS builds of libjpeg-turbo are no longer supported,
+and the libjpeg-turbo build system can no longer be used to package such
+builds.  32-bit Mac applications cannot run in macOS 10.15 "Catalina" and
+later, and the App Store no longer allows them.
+
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+significantly optimized, resulting in a measured average overall compression
+speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
+and AMD CPUs, as well as a measured average overall compression speedup of
+0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
+implementation.
+
+6. The block smoothing algorithm that is applied by default when decompressing
+progressive Huffman-encoded JPEG images has been improved in the following
+ways:
+
+     - The algorithm is now more fault-tolerant.  Previously, if a particular
+scan was incomplete, then the smoothing parameters for the incomplete scan
+would be applied to the entire output image, including the parts of the image
+that were generated by the prior (complete) scan.  Visually, this had the
+effect of removing block smoothing from lower-frequency scans if they were
+followed by an incomplete higher-frequency scan.  libjpeg-turbo now applies
+block smoothing parameters to each iMCU row based on which scan generated the
+pixels in that row, rather than always using the block smoothing parameters for
+the most recent scan.
+     - When applying block smoothing to DC scans, a Gaussian-like kernel with a
+5x5 window is used to reduce the "blocky" appearance.
+
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
+
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+instructions, so that the Loongson MMI SIMD extensions can be included in any
+MIPS64 libjpeg-turbo build.
+
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+methods by which applications can guard against the exploits of the JPEG format
+described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+     - Both programs now accept a `-maxscans` argument, which can be used to
+limit the number of allowable scans in the input file.
+     - Both programs now accept a `-strict` argument, which can be used to
+treat all warnings as fatal.
+
+10. CMake package config files are now included for both the libjpeg and
+TurboJPEG API libraries.  This facilitates using libjpeg-turbo with CMake's
+`find_package()` function.  For example:
+
+        find_package(libjpeg-turbo CONFIG REQUIRED)
+
+        add_executable(libjpeg_program libjpeg_program.c)
+        target_link_libraries(libjpeg_program PUBLIC libjpeg-turbo::jpeg)
+
+        add_executable(libjpeg_program_static libjpeg_program.c)
+        target_link_libraries(libjpeg_program_static PUBLIC
+          libjpeg-turbo::jpeg-static)
+
+        add_executable(turbojpeg_program turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program PUBLIC
+          libjpeg-turbo::turbojpeg)
+
+        add_executable(turbojpeg_program_static turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program_static PUBLIC
+          libjpeg-turbo::turbojpeg-static)
+
+11. Since the Unisys LZW patent has long expired, cjpeg and djpeg can now
+read/write both LZW-compressed and uncompressed GIF files (feature ported from
+jpeg-6a and jpeg-9d.)
+
+12. jpegtran now includes the `-wipe` and `-drop` options from jpeg-9a and
+jpeg-9d, as well as the ability to expand the image size using the `-crop`
+option.  Refer to jpegtran.1 or usage.txt for more details.
+
+13. Added a complete intrinsics implementation of the Arm Neon SIMD extensions,
+thus providing SIMD acceleration on Arm platforms for all of the algorithms
+that are SIMD-accelerated on x86 platforms.  This new implementation is
+significantly faster in some cases than the old GAS implementation--
+depending on the algorithms used, the type of CPU core, and the compiler.  GCC,
+as of this writing, does not provide a full or optimal set of Neon intrinsics,
+so for performance reasons, the default when building libjpeg-turbo with GCC is
+to continue using the GAS implementation of the following algorithms:
+
+     - 32-bit RGB-to-YCbCr color conversion
+     - 32-bit fast and accurate inverse DCT
+     - 64-bit RGB-to-YCbCr and YCbCr-to-RGB color conversion
+     - 64-bit accurate forward and inverse DCT
+     - 64-bit Huffman encoding
+
+    A new CMake variable (`NEON_INTRINSICS`) can be used to override this
+default.
+
+    Since the new intrinsics implementation includes SIMD acceleration
+for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
+been reverted.
+
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
+libjpeg-turbo SDK package for both iOS and macOS.
+
+
+2.0.6
+=====
+
+### Significant changes relative to 2.0.5:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when using any of the YUV encoding/compression/decompression/decoding
+methods in the TurboJPEG Java API.
+
+2. Fixed or worked around multiple issues with `jpeg_skip_scanlines()`:
+
+     - Fixed segfaults or "Corrupt JPEG data: premature end of data segment"
+errors in `jpeg_skip_scanlines()` that occurred when decompressing 4:2:2 or
+4:2:0 JPEG images using merged (non-fancy) upsampling/color conversion (that
+is, when setting `cinfo.do_fancy_upsampling` to `FALSE`.)  2.0.0[6] was a
+similar fix, but it did not cover all cases.
+     - `jpeg_skip_scanlines()` now throws an error if two-pass color
+quantization is enabled.  Two-pass color quantization never worked properly
+with `jpeg_skip_scanlines()`, and the issues could not readily be fixed.
+     - Fixed an issue whereby `jpeg_skip_scanlines()` always returned 0 when
+skipping past the end of an image.
+
+3. The Arm 64-bit (Armv8) Neon SIMD extensions can now be built using MinGW
+toolchains targetting Arm64 (AArch64) Windows binaries.
+
+4. Fixed unexpected visual artifacts that occurred when using
+`jpeg_crop_scanline()` and interblock smoothing while decompressing only the DC
+scan of a progressive JPEG image.
+
+5. Fixed an issue whereby libjpeg-turbo would not build if 12-bit-per-component
+JPEG support (`WITH_12BIT`) was enabled along with libjpeg v7 or libjpeg v8
+API/ABI emulation (`WITH_JPEG7` or `WITH_JPEG8`.)
+
+
+2.0.5
+=====
+
+### Significant changes relative to 2.0.4:
+
+1. Worked around issues in the MIPS DSPr2 SIMD extensions that caused failures
+in the libjpeg-turbo regression tests.  Specifically, the
+`jsimd_h2v1_downsample_dspr2()` and `jsimd_h2v2_downsample_dspr2()` functions
+in the MIPS DSPr2 SIMD extensions are now disabled until/unless they can be
+fixed, and other functions that are incompatible with big endian MIPS CPUs are
+disabled when building libjpeg-turbo for such CPUs.
+
+2. Fixed an oversight in the `TJCompressor.compress(int)` method in the
+TurboJPEG Java API that caused an error ("java.lang.IllegalStateException: No
+source image is associated with this instance") when attempting to use that
+method to compress a YUV image.
+
+3. Fixed an issue (CVE-2020-13790) in the PPM reader that caused a buffer
+overrun in cjpeg, TJBench, or the `tjLoadImage()` function if one of the values
+in a binary PPM/PGM input file exceeded the maximum value defined in the file's
+header and that maximum value was less than 255.  libjpeg-turbo 1.5.0 already
+included a similar fix for binary PPM/PGM files with maximum values greater
+than 255.
+
+4. The TurboJPEG API library's global error handler, which is used in functions
+such as `tjBufSize()` and `tjLoadImage()` that do not require a TurboJPEG
+instance handle, is now thread-safe on platforms that support thread-local
+storage.
+
+
+2.0.4
+=====
+
+### Significant changes relative to 2.0.3:
+
+1. Fixed a regression in the Windows packaging system (introduced by
+2.0 beta1[2]) whereby, if both the 64-bit libjpeg-turbo SDK for GCC and the
+64-bit libjpeg-turbo SDK for Visual C++ were installed on the same system, only
+one of them could be uninstalled.
+
+2. Fixed a signed integer overflow and subsequent segfault that occurred when
+attempting to decompress images with more than 715827882 pixels using the
+64-bit C version of TJBench.
+
+3. Fixed out-of-bounds write in `tjDecompressToYUV2()` and
+`tjDecompressToYUVPlanes()` (sometimes manifesting as a double free) that
+occurred when attempting to decompress grayscale JPEG images that were
+compressed with a sampling factor other than 1 (for instance, with
+`cjpeg -grayscale -sample 2x2`).
+
+4. Fixed a regression introduced by 2.0.2[5] that caused the TurboJPEG API to
+incorrectly identify some JPEG images with unusual sampling factors as 4:4:4
+JPEG images.  This was known to cause a buffer overflow when attempting to
+decompress some such images using `tjDecompressToYUV2()` or
+`tjDecompressToYUVPlanes()`.
+
+5. Fixed an issue (CVE-2020-17541), detected by ASan, whereby attempting to
+losslessly transform a specially-crafted malformed JPEG image containing an
+extremely-high-frequency coefficient block (junk image data that could never be
+generated by a legitimate JPEG compressor) could cause the Huffman encoder's
+local buffer to be overrun. (Refer to 1.4.0[9] and 1.4beta1[15].)  Given that
+the buffer overrun was fully contained within the stack and did not cause a
+segfault or other user-visible errant behavior, and given that the lossless
+transformer (unlike the decompressor) is not generally exposed to arbitrary
+data exploits, this issue did not likely pose a security risk.
+
+6. The Arm 64-bit (Armv8) Neon SIMD assembly code now stores constants in a
+separate read-only data section rather than in the text section, to support
+execute-only memory layouts.
+
+
+2.0.3
+=====
+
+### Significant changes relative to 2.0.2:
+
+1. Fixed "using JNI after critical get" errors that occurred on Android
+platforms when passing invalid arguments to certain methods in the TurboJPEG
+Java API.
+
+2. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that was known to cause an illegal
+instruction exception, in rare cases, on CPUs that lack support for CPUID leaf
+07H (or on which the maximum CPUID leaf has been limited by way of a BIOS
+setting.)
+
+3. The 4:4:0 (h1v2) fancy (smooth) chroma upsampling algorithm in the
+decompressor now uses a similar bias pattern to that of the 4:2:2 (h2v1) fancy
+chroma upsampling algorithm, rounding up or down the upsampled result for
+alternate pixels rather than always rounding down.  This ensures that,
+regardless of whether a 4:2:2 JPEG image is rotated or transposed prior to
+decompression (in the frequency domain) or after decompression (in the spatial
+domain), the final image will be similar.
+
+4. Fixed an integer overflow and subsequent segfault that occurred when
+attempting to compress or decompress images with more than 1 billion pixels
+using the TurboJPEG API.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 16 would result in an
+error ("Missing Huffman code table entry") and an invalid JPEG image.
+
+6. Fixed an issue whereby `tjDecodeYUV()` and `tjDecodeYUVPlanes()` would throw
+an error ("Invalid progressive parameters") or a warning ("Inconsistent
+progression sequence") if passed a TurboJPEG instance that was previously used
+to decompress a progressive JPEG image.
+
+
+2.0.2
+=====
+
+### Significant changes relative to 2.0.1:
+
+1. Fixed a regression introduced by 2.0.1[5] that prevented a runtime search
+path (rpath) from being embedded in the libjpeg-turbo shared libraries and
+executables for macOS and iOS.  This caused a fatal error of the form
+"dyld: Library not loaded" when attempting to use one of the executables,
+unless `DYLD_LIBRARY_PATH` was explicitly set to the location of the
+libjpeg-turbo shared libraries.
+
+2. Fixed an integer overflow and subsequent segfault (CVE-2018-20330) that
+occurred when attempting to load a BMP file with more than 1 billion pixels
+using the `tjLoadImage()` function.
+
+3. Fixed a buffer overrun (CVE-2018-19664) that occurred when attempting to
+decompress a specially-crafted malformed JPEG image to a 256-color BMP using
+djpeg.
+
+4. Fixed a floating point exception that occurred when attempting to
+decompress a specially-crafted malformed JPEG image with a specified image
+width or height of 0 using the C version of TJBench.
+
+5. The TurboJPEG API will now decompress 4:4:4 JPEG images with 2x1, 1x2, 3x1,
+or 1x3 luminance and chrominance sampling factors.  This is a non-standard way
+of specifying 1x subsampling (normally 4:4:4 JPEGs have 1x1 luminance and
+chrominance sampling factors), but the JPEG format and the libjpeg API both
+allow it.
+
+6. Fixed a regression introduced by 2.0 beta1[7] that caused djpeg to generate
+incorrect PPM images when used with the `-colors` option.
+
+7. Fixed an issue whereby a static build of libjpeg-turbo (a build in which
+`ENABLE_SHARED` is `0`) could not be installed using the Visual Studio IDE.
+
+8. Fixed a severe performance issue in the Loongson MMI SIMD extensions that
+occurred when compressing RGB images whose image rows were not 64-bit-aligned.
+
+
+2.0.1
+=====
+
+### Significant changes relative to 2.0.0:
+
+1. Fixed a regression introduced with the new CMake-based Un*x build system,
+whereby jconfig.h could cause compiler warnings of the form
+`"HAVE_*_H" redefined` if it was included by downstream Autotools-based
+projects that used `AC_CHECK_HEADERS()` to check for the existence of locale.h,
+stddef.h, or stdlib.h.
+
+2. The `jsimd_quantize_float_dspr2()` and `jsimd_convsamp_float_dspr2()`
+functions in the MIPS DSPr2 SIMD extensions are now disabled at compile time
+if the soft float ABI is enabled.  Those functions use instructions that are
+incompatible with the soft float ABI.
+
+3. Fixed a regression in the SIMD feature detection code, introduced by
+the AVX2 SIMD extensions (2.0 beta1[1]), that caused libjpeg-turbo to crash on
+Windows 7 if Service Pack 1 was not installed.
+
+4. Fixed out-of-bounds read in cjpeg that occurred when attempting to compress
+a specially-crafted malformed color-index (8-bit-per-sample) Targa file in
+which some of the samples (color indices) exceeded the bounds of the Targa
+file's color table.
+
+5. Fixed an issue whereby installing a fully static build of libjpeg-turbo
+(a build in which `CFLAGS` contains `-static` and `ENABLE_SHARED` is `0`) would
+fail with "No valid ELF RPATH or RUNPATH entry exists in the file."
+
+
+2.0.0
+=====
+
+### Significant changes relative to 2.0 beta1:
+
+1. The TurboJPEG API can now decompress CMYK JPEG images that have subsampled M
+and Y components (not to be confused with YCCK JPEG images, in which the C/M/Y
+components have been transformed into luma and chroma.)   Previously, an error
+was generated ("Could not determine subsampling type for JPEG image") when such
+an image was passed to `tjDecompressHeader3()`, `tjTransform()`,
+`tjDecompressToYUVPlanes()`, `tjDecompressToYUV2()`, or the equivalent Java
+methods.
+
+2. Fixed an issue (CVE-2018-11813) whereby a specially-crafted malformed input
+file (specifically, a file with a valid Targa header but incomplete pixel data)
+would cause cjpeg to generate a JPEG file that was potentially thousands of
+times larger than the input file.  The Targa reader in cjpeg was not properly
+detecting that the end of the input file had been reached prematurely, so after
+all valid pixels had been read from the input, the reader injected dummy pixels
+with values of 255 into the JPEG compressor until the number of pixels
+specified in the Targa header had been compressed.  The Targa reader in cjpeg
+now behaves like the PPM reader and aborts compression if the end of the input
+file is reached prematurely.  Because this issue only affected cjpeg and not
+the underlying library, and because it did not involve any out-of-bounds reads
+or other exploitable behaviors, it was not believed to represent a security
+threat.
+
+3. Fixed an issue whereby the `tjLoadImage()` and `tjSaveImage()` functions
+would produce a "Bogus message code" error message if the underlying bitmap and
+PPM readers/writers threw an error that was specific to the readers/writers
+(as opposed to a general libjpeg API error.)
+
+4. Fixed an issue (CVE-2018-1152) whereby a specially-crafted malformed BMP
+file, one in which the header specified an image width of 1073741824 pixels,
+would trigger a floating point exception (division by zero) in the
+`tjLoadImage()` function when attempting to load the BMP file into a
+4-component image buffer.
+
+5. Fixed an issue whereby certain combinations of calls to
+`jpeg_skip_scanlines()` and `jpeg_read_scanlines()` could trigger an infinite
+loop when decompressing progressive JPEG images that use vertical chroma
+subsampling (for instance, 4:2:0 or 4:4:0.)
+
+6. Fixed a segfault in `jpeg_skip_scanlines()` that occurred when decompressing
+a 4:2:2 or 4:2:0 JPEG image using the merged (non-fancy) upsampling algorithms
+(that is, when setting `cinfo.do_fancy_upsampling` to `FALSE`.)
+
+7. The new CMake-based build system will now disable the MIPS DSPr2 SIMD
+extensions if it detects that the compiler does not support DSPr2 instructions.
+
+8. Fixed out-of-bounds read in cjpeg (CVE-2018-14498) that occurred when
+attempting to compress a specially-crafted malformed color-index
+(8-bit-per-sample) BMP file in which some of the samples (color indices)
+exceeded the bounds of the BMP file's color table.
+
+9. Fixed a signed integer overflow in the progressive Huffman decoder, detected
+by the Clang and GCC undefined behavior sanitizers, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image.  This issue
+did not pose a security threat, but removing the warning made it easier to
+detect actual security issues, should they arise in the future.
+
+
+1.5.90 (2.0 beta1)
+==================
+
+### Significant changes relative to 1.5.3:
+
+1. Added AVX2 SIMD implementations of the colorspace conversion, chroma
+downsampling and upsampling, integer quantization and sample conversion, and
+accurate integer DCT/IDCT algorithms.  When using the accurate integer DCT/IDCT
+algorithms on AVX2-equipped CPUs, the compression of RGB images is
+approximately 13-36% (avg. 22%) faster (relative to libjpeg-turbo 1.5.x) with
+64-bit code and 11-21% (avg. 17%) faster with 32-bit code, and the
+decompression of RGB images is approximately 9-35% (avg. 17%) faster with
+64-bit code and 7-17% (avg. 12%) faster with 32-bit code.  (As tested on a
+3 GHz Intel Core i7.  Actual mileage may vary.)
+
+2. Overhauled the build system to use CMake on all platforms, and removed the
+autotools-based build system.  This decision resulted from extensive
+discussions within the libjpeg-turbo community.  libjpeg-turbo traditionally
+used CMake only for Windows builds, but there was an increasing amount of
+demand to extend CMake support to other platforms.  However, because of the
+unique nature of our code base (the need to support different assemblers on
+each platform, the need for Java support, etc.), providing dual build systems
+as other OSS imaging libraries do (including libpng and libtiff) would have
+created a maintenance burden.  The use of CMake greatly simplifies some aspects
+of our build system, owing to CMake's built-in support for various assemblers,
+Java, and unit testing, as well as generally fewer quirks that have to be
+worked around in order to implement our packaging system.  Eliminating
+autotools puts our project slightly at odds with the traditional practices of
+the OSS community, since most "system libraries" tend to be built with
+autotools, but it is believed that the benefits of this move outweigh the
+risks.  In addition to providing a unified build environment, switching to
+CMake allows for the use of various build tools and IDEs that aren't supported
+under autotools, including XCode, Ninja, and Eclipse.  It also eliminates the
+need to install autotools via MacPorts/Homebrew on OS X and allows
+libjpeg-turbo to be configured without the use of a terminal/command prompt.
+Extensive testing was conducted to ensure that all features provided by the
+autotools-based build system are provided by the new build system.
+
+3. The libjpeg API in this version of libjpeg-turbo now includes two additional
+functions, `jpeg_read_icc_profile()` and `jpeg_write_icc_profile()`, that can
+be used to extract ICC profile data from a JPEG file while decompressing or to
+embed ICC profile data in a JPEG file while compressing or transforming.  This
+eliminates the need for downstream projects, such as color management libraries
+and browsers, to include their own glueware for accomplishing this.
+
+4. Improved error handling in the TurboJPEG API library:
+
+     - Introduced a new function (`tjGetErrorStr2()`) in the TurboJPEG C API
+that allows compression/decompression/transform error messages to be retrieved
+in a thread-safe manner.  Retrieving error messages from global functions, such
+as `tjInitCompress()` or `tjBufSize()`, is still thread-unsafe, but since those
+functions will only throw errors if passed an invalid argument or if a memory
+allocation failure occurs, thread safety is not as much of a concern.
+     - Introduced a new function (`tjGetErrorCode()`) in the TurboJPEG C API
+and a new method (`TJException.getErrorCode()`) in the TurboJPEG Java API that
+can be used to determine the severity of the last
+compression/decompression/transform error.  This allows applications to
+choose whether to ignore warnings (non-fatal errors) from the underlying
+libjpeg API or to treat them as fatal.
+     - Introduced a new flag (`TJFLAG_STOPONWARNING` in the TurboJPEG C API and
+`TJ.FLAG_STOPONWARNING` in the TurboJPEG Java API) that causes the library to
+immediately halt a compression/decompression/transform operation if it
+encounters a warning from the underlying libjpeg API (the default behavior is
+to allow the operation to complete unless a fatal error is encountered.)
+
+5. Introduced a new flag in the TurboJPEG C and Java APIs (`TJFLAG_PROGRESSIVE`
+and `TJ.FLAG_PROGRESSIVE`, respectively) that causes the library to use
+progressive entropy coding in JPEG images generated by compression and
+transform operations.  Additionally, a new transform option
+(`TJXOPT_PROGRESSIVE` in the C API and `TJTransform.OPT_PROGRESSIVE` in the
+Java API) has been introduced, allowing progressive entropy coding to be
+enabled for selected transforms in a multi-transform operation.
+
+6. Introduced a new transform option in the TurboJPEG API (`TJXOPT_COPYNONE` in
+the C API and `TJTransform.OPT_COPYNONE` in the Java API) that allows the
+copying of markers (including EXIF and ICC profile data) to be disabled for a
+particular transform.
+
+7. Added two functions to the TurboJPEG C API (`tjLoadImage()` and
+`tjSaveImage()`) that can be used to load/save a BMP or PPM/PGM image to/from a
+memory buffer with a specified pixel format and layout.  These functions
+replace the project-private (and slow) bmp API, which was previously used by
+TJBench, and they also provide a convenient way for first-time users of
+libjpeg-turbo to quickly develop a complete JPEG compression/decompression
+program.
+
+8. The TurboJPEG C API now includes a new convenience array (`tjAlphaOffset[]`)
+that contains the alpha component index for each pixel format (or -1 if the
+pixel format lacks an alpha component.)  The TurboJPEG Java API now includes a
+new method (`TJ.getAlphaOffset()`) that returns the same value.  In addition,
+the `tjRedOffset[]`, `tjGreenOffset[]`, and `tjBlueOffset[]` arrays-- and the
+corresponding `TJ.getRedOffset()`, `TJ.getGreenOffset()`, and
+`TJ.getBlueOffset()` methods-- now return -1 for `TJPF_GRAY`/`TJ.PF_GRAY`
+rather than 0.  This allows programs to easily determine whether a pixel format
+has red, green, blue, and alpha components.
+
+9. Added a new example (tjexample.c) that demonstrates the basic usage of the
+TurboJPEG C API.  This example mirrors the functionality of TJExample.java.
+Both files are now included in the libjpeg-turbo documentation.
+
+10. Fixed two signed integer overflows in the arithmetic decoder, detected by
+the Clang undefined behavior sanitizer, that could be triggered by attempting
+to decompress a specially-crafted malformed JPEG image.  These issues did not
+pose a security threat, but removing the warnings makes it easier to detect
+actual security issues, should they arise in the future.
+
+11. Fixed a bug in the merged 4:2:0 upsampling/dithered RGB565 color conversion
+algorithm that caused incorrect dithering in the output image.  This algorithm
+now produces bitwise-identical results to the unmerged algorithms.
+
+12. The SIMD function symbols for x86[-64]/ELF, MIPS/ELF, macOS/x86[-64] (if
+libjpeg-turbo is built with Yasm), and iOS/Arm[64] builds are now private.
+This prevents those symbols from being exposed in applications or shared
+libraries that link statically with libjpeg-turbo.
+
+13. Added Loongson MMI SIMD implementations of the RGB-to-YCbCr and
+YCbCr-to-RGB colorspace conversion, 4:2:0 chroma downsampling, 4:2:0 fancy
+chroma upsampling, integer quantization, and accurate integer DCT/IDCT
+algorithms.  When using the accurate integer DCT/IDCT, this speeds up the
+compression of RGB images by approximately 70-100% and the decompression of RGB
+images by approximately 2-3.5x.
+
+14. Fixed a build error when building with older MinGW releases (regression
+caused by 1.5.1[7].)
+
+15. Added SIMD acceleration for progressive Huffman encoding on SSE2-capable
+x86 and x86-64 platforms.  This speeds up the compression of full-color
+progressive JPEGs by about 85-90% on average (relative to libjpeg-turbo 1.5.x)
+when using modern Intel and AMD CPUs.
+
+
+1.5.3
+=====
+
+### Significant changes relative to 1.5.2:
+
+1. Fixed a NullPointerException in the TurboJPEG Java wrapper that occurred
+when using the YUVImage constructor that creates an instance backed by separate
+image planes and allocates memory for the image planes.
+
+2. Fixed an issue whereby the Java version of TJUnitTest would fail when
+testing BufferedImage encoding/decoding on big endian systems.
+
+3. Fixed a segfault in djpeg that would occur if an output format other than
+PPM/PGM was selected along with the `-crop` option.  The `-crop` option now
+works with the GIF and Targa formats as well (unfortunately, it cannot be made
+to work with the BMP and RLE formats due to the fact that those output engines
+write scanlines in bottom-up order.)  djpeg will now exit gracefully if an
+output format other than PPM/PGM, GIF, or Targa is selected along with the
+`-crop` option.
+
+4. Fixed an issue (CVE-2017-15232) whereby `jpeg_skip_scanlines()` would
+segfault if color quantization was enabled.
+
+5. TJBench (both C and Java versions) will now display usage information if any
+command-line argument is unrecognized.  This prevents the program from silently
+ignoring typos.
+
+6. Fixed an access violation in tjbench.exe (Windows) that occurred when the
+program was used to decompress an existing JPEG image.
+
+7. Fixed an ArrayIndexOutOfBoundsException in the TJExample Java program that
+occurred when attempting to decompress a JPEG image that had been compressed
+with 4:1:1 chrominance subsampling.
+
+8. Fixed an issue whereby, when using `jpeg_skip_scanlines()` to skip to the
+end of a single-scan (non-progressive) image, subsequent calls to
+`jpeg_consume_input()` would return `JPEG_SUSPENDED` rather than
+`JPEG_REACHED_EOI`.
+
+9. `jpeg_crop_scanline()` now works correctly when decompressing grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`).
+
+
+1.5.2
+=====
+
+### Significant changes relative to 1.5.1:
+
+1. Fixed a regression introduced by 1.5.1[7] that prevented libjpeg-turbo from
+building with Android NDK platforms prior to android-21 (5.0).
+
+2. Fixed a regression introduced by 1.5.1[1] that prevented the MIPS DSPR2 SIMD
+code in libjpeg-turbo from building.
+
+3. Fixed a regression introduced by 1.5 beta1[11] that prevented the Java
+version of TJBench from outputting any reference images (the `-nowrite` switch
+was accidentally enabled by default.)
+
+4. libjpeg-turbo should now build and run with full AltiVec SIMD acceleration
+on PowerPC-based AmigaOS 4 and OpenBSD systems.
+
+5. Fixed build and runtime errors on Windows that occurred when building
+libjpeg-turbo with libjpeg v7 API/ABI emulation and the in-memory
+source/destination managers.  Due to an oversight, the `jpeg_skip_scanlines()`
+and `jpeg_crop_scanline()` functions were not being included in jpeg7.dll when
+libjpeg-turbo was built with `-DWITH_JPEG7=1` and `-DWITH_MEMSRCDST=1`.
+
+6. Fixed "Bogus virtual array access" error that occurred when using the
+lossless crop feature in jpegtran or the TurboJPEG API, if libjpeg-turbo was
+built with libjpeg v7 API/ABI emulation.  This was apparently a long-standing
+bug that has existed since the introduction of libjpeg v7/v8 API/ABI emulation
+in libjpeg-turbo v1.1.
+
+7. The lossless transform features in jpegtran and the TurboJPEG API will now
+always attempt to adjust the EXIF image width and height tags if the image size
+changed as a result of the transform.  This behavior has always existed when
+using libjpeg v8 API/ABI emulation.  It was supposed to be available with
+libjpeg v7 API/ABI emulation as well but did not work properly due to a bug.
+Furthermore, there was never any good reason not to enable it with libjpeg v6b
+API/ABI emulation, since the behavior is entirely internal.  Note that
+`-copy all` must be passed to jpegtran in order to transfer the EXIF tags from
+the source image to the destination image.
+
+8. Fixed several memory leaks in the TurboJPEG API library that could occur
+if the library was built with certain compilers and optimization levels
+(known to occur with GCC 4.x and clang with `-O1` and higher but not with
+GCC 5.x or 6.x) and one of the underlying libjpeg API functions threw an error
+after a TurboJPEG API function allocated a local buffer.
+
+9. The libjpeg-turbo memory manager will now honor the `max_memory_to_use`
+structure member in jpeg\_memory\_mgr, which can be set to the maximum amount
+of memory (in bytes) that libjpeg-turbo should use during decompression or
+multi-pass (including progressive) compression.  This limit can also be set
+using the `JPEGMEM` environment variable or using the `-maxmemory` switch in
+cjpeg/djpeg/jpegtran (refer to the respective man pages for more details.)
+This has been a documented feature of libjpeg since v5, but the
+`malloc()`/`free()` implementation of the memory manager (jmemnobs.c) never
+implemented the feature.  Restricting libjpeg-turbo's memory usage is useful
+for two reasons:  it allows testers to more easily work around the 2 GB limit
+in libFuzzer, and it allows developers of security-sensitive applications to
+more easily defend against one of the progressive JPEG exploits (LJT-01-004)
+identified in
+[this report](http://www.libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+10. TJBench will now run each benchmark for 1 second prior to starting the
+timer, in order to improve the consistency of the results.  Furthermore, the
+`-warmup` option is now used to specify the amount of warmup time rather than
+the number of warmup iterations.
+
+11. Fixed an error (`short jump is out of range`) that occurred when assembling
+the 32-bit x86 SIMD extensions with NASM versions prior to 2.04.  This was a
+regression introduced by 1.5 beta1[12].
+
+
+1.5.1
+=====
+
+### Significant changes relative to 1.5.0:
+
+1. Previously, the undocumented `JSIMD_FORCE*` environment variables could be
+used to force-enable a particular SIMD instruction set if multiple instruction
+sets were available on a particular platform.  On x86 platforms, where CPU
+feature detection is bulletproof and multiple SIMD instruction sets are
+available, it makes sense for those environment variables to allow forcing the
+use of an instruction set only if that instruction set is available.  However,
+since the ARM implementations of libjpeg-turbo can only use one SIMD
+instruction set, and since their feature detection code is less bulletproof
+(parsing /proc/cpuinfo), it makes sense for the `JSIMD_FORCENEON` environment
+variable to bypass the feature detection code and really force the use of NEON
+instructions.  A new environment variable (`JSIMD_FORCEDSPR2`) was introduced
+in the MIPS implementation for the same reasons, and the existing
+`JSIMD_FORCENONE` environment variable was extended to that implementation.
+These environment variables provide a workaround for those attempting to test
+ARM and MIPS builds of libjpeg-turbo in QEMU, which passes through
+/proc/cpuinfo from the host system.
+
+2. libjpeg-turbo previously assumed that AltiVec instructions were always
+available on PowerPC platforms, which led to "illegal instruction" errors when
+running on PowerPC chips that lack AltiVec support (such as the older 7xx/G3
+and newer e5500 series.)  libjpeg-turbo now examines /proc/cpuinfo on
+Linux/Android systems and enables AltiVec instructions only if the CPU supports
+them.  It also now provides two environment variables, `JSIMD_FORCEALTIVEC` and
+`JSIMD_FORCENONE`, to force-enable and force-disable AltiVec instructions in
+environments where /proc/cpuinfo is an unreliable means of CPU feature
+detection (such as when running in QEMU.)  On OS X, libjpeg-turbo continues to
+assume that AltiVec support is always available, which means that libjpeg-turbo
+cannot be used with G3 Macs unless you set the environment variable
+`JSIMD_FORCENONE` to `1`.
+
+3. Fixed an issue whereby 64-bit ARM (AArch64) builds of libjpeg-turbo would
+crash when built with recent releases of the Clang/LLVM compiler.  This was
+caused by an ABI conformance issue in some of libjpeg-turbo's 64-bit NEON SIMD
+routines.  Those routines were incorrectly using 64-bit instructions to
+transfer a 32-bit JDIMENSION argument, whereas the ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined.  The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fancy upsampling is now supported when decompressing JPEG images that use
+4:4:0 (h1v2) chroma subsampling.  These images are generated when losslessly
+rotating or transposing JPEG images that use 4:2:2 (h2v1) chroma subsampling.
+The h1v2 fancy upsampling algorithm is not currently SIMD-accelerated.
+
+5. If merged upsampling isn't SIMD-accelerated but YCbCr-to-RGB conversion is,
+then libjpeg-turbo will now disable merged upsampling when decompressing YCbCr
+JPEG images into RGB or extended RGB output images.  This significantly speeds
+up the decompression of 4:2:0 and 4:2:2 JPEGs on ARM platforms if fancy
+upsampling is not used (for example, if the `-nosmooth` option to djpeg is
+specified.)
+
+6. The TurboJPEG API will now decompress 4:2:2 and 4:4:0 JPEG images with
+2x2 luminance sampling factors and 2x1 or 1x2 chrominance sampling factors.
+This is a non-standard way of specifying 2x subsampling (normally 4:2:2 JPEGs
+have 2x1 luminance and 1x1 chrominance sampling factors, and 4:4:0 JPEGs have
+1x2 luminance and 1x1 chrominance sampling factors), but the JPEG format and
+the libjpeg API both allow it.
+
+7. Fixed an unsigned integer overflow in the libjpeg memory manager, detected
+by the Clang undefined behavior sanitizer, that could be triggered by
+attempting to decompress a specially-crafted malformed JPEG image.  This issue
+affected only 32-bit code and did not pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+8. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers when attempting to decompress
+specially-crafted malformed JPEG images.  None of these issues posed a security
+threat, but removing the warnings makes it easier to detect actual security
+issues, should they arise in the future.
+
+9. Fixed an out-of-bounds array reference, introduced by 1.4.90[2] (partial
+image decompression) and detected by the Clang undefined behavior sanitizer,
+that could be triggered by a specially-crafted malformed JPEG image with more
+than four components.  Because the out-of-bounds reference was still within the
+same structure, it was not known to pose a security threat, but removing the
+warning makes it easier to detect actual security issues, should they arise in
+the future.
+
+10. Fixed another ABI conformance issue in the 64-bit ARM (AArch64) NEON SIMD
+code.  Some of the routines were incorrectly reading and storing data below the
+stack pointer, which caused segfaults in certain applications under specific
+circumstances.
+
+
+1.5.0
+=====
+
+### Significant changes relative to 1.5 beta1:
+
+1. Fixed an issue whereby a malformed motion-JPEG frame could cause the "fast
+path" of libjpeg-turbo's Huffman decoder to read from uninitialized memory.
+
+2. Added libjpeg-turbo version and build information to the global string table
+of the libjpeg and TurboJPEG API libraries.  This is a common practice in other
+infrastructure libraries, such as OpenSSL and libpng, because it makes it easy
+to examine an application binary and determine which version of the library the
+application was linked against.
+
+3. Fixed a couple of issues in the PPM reader that would cause buffer overruns
+in cjpeg if one of the values in a binary PPM/PGM input file exceeded the
+maximum value defined in the file's header and that maximum value was greater
+than 255.  libjpeg-turbo 1.4.2 already included a similar fix for ASCII PPM/PGM
+files.  Note that these issues were not security bugs, since they were confined
+to the cjpeg program and did not affect any of the libjpeg-turbo libraries.
+
+4. Fixed an issue whereby attempting to decompress a JPEG file with a corrupt
+header using the `tjDecompressToYUV2()` function would cause the function to
+abort without returning an error and, under certain circumstances, corrupt the
+stack.  This only occurred if `tjDecompressToYUV2()` was called prior to
+calling `tjDecompressHeader3()`, or if the return value from
+`tjDecompressHeader3()` was ignored (both cases represent incorrect usage of
+the TurboJPEG API.)
+
+5. Fixed an issue in the ARM 32-bit SIMD-accelerated Huffman encoder that
+prevented the code from assembling properly with clang.
+
+6. The `jpeg_stdio_src()`, `jpeg_mem_src()`, `jpeg_stdio_dest()`, and
+`jpeg_mem_dest()` functions in the libjpeg API will now throw an error if a
+source/destination manager has already been assigned to the compress or
+decompress object by a different function or by the calling program.  This
+prevents these functions from attempting to reuse a source/destination manager
+structure that was allocated elsewhere, because there is no way to ensure that
+it would be big enough to accommodate the new source/destination manager.
+
+
+1.4.90 (1.5 beta1)
+==================
+
+### Significant changes relative to 1.4.2:
+
+1. Added full SIMD acceleration for PowerPC platforms using AltiVec VMX
+(128-bit SIMD) instructions.  Although the performance of libjpeg-turbo on
+PowerPC was already good, due to the increased number of registers available
+to the compiler vs. x86, it was still possible to speed up compression by about
+3-4x and decompression by about 2-2.5x (relative to libjpeg v6b) through the
+use of AltiVec instructions.
+
+2. Added two new libjpeg API functions (`jpeg_skip_scanlines()` and
+`jpeg_crop_scanline()`) that can be used to partially decode a JPEG image.  See
+[libjpeg.txt](libjpeg.txt) for more details.
+
+3. The TJCompressor and TJDecompressor classes in the TurboJPEG Java API now
+implement the Closeable interface, so those classes can be used with a
+try-with-resources statement.
+
+4. The TurboJPEG Java classes now throw unchecked idiomatic exceptions
+(IllegalArgumentException, IllegalStateException) for unrecoverable errors
+caused by incorrect API usage, and those classes throw a new checked exception
+type (TJException) for errors that are passed through from the C library.
+
+5. Source buffers for the TurboJPEG C API functions, as well as the
+`jpeg_mem_src()` function in the libjpeg API, are now declared as const
+pointers.  This facilitates passing read-only buffers to those functions and
+ensures the caller that the source buffer will not be modified.  This should
+not create any backward API or ABI incompatibilities with prior libjpeg-turbo
+releases.
+
+6. The MIPS DSPr2 SIMD code can now be compiled to support either FR=0 or FR=1
+FPUs.
+
+7. Fixed additional negative left shifts and other issues reported by the GCC
+and Clang undefined behavior sanitizers.  Most of these issues affected only
+32-bit code, and none of them was known to pose a security threat, but removing
+the warnings makes it easier to detect actual security issues, should they
+arise in the future.
+
+8. Removed the unnecessary `.arch` directive from the ARM64 NEON SIMD code.
+This directive was preventing the code from assembling using the clang
+integrated assembler.
+
+9. Fixed a regression caused by 1.4.1[6] that prevented 32-bit and 64-bit
+libjpeg-turbo RPMs from being installed simultaneously on recent Red Hat/Fedora
+distributions.  This was due to the addition of a macro in jconfig.h that
+allows the Huffman codec to determine the word size at compile time.  Since
+that macro differs between 32-bit and 64-bit builds, this caused a conflict
+between the i386 and x86_64 RPMs (any differing files, other than executables,
+are not allowed when 32-bit and 64-bit RPMs are installed simultaneously.)
+Since the macro is used only internally, it has been moved into jconfigint.h.
+
+10. The x86-64 SIMD code can now be disabled at run time by setting the
+`JSIMD_FORCENONE` environment variable to `1` (the other SIMD implementations
+already had this capability.)
+
+11. Added a new command-line argument to TJBench (`-nowrite`) that prevents the
+benchmark from outputting any images.  This removes any potential operating
+system overhead that might be caused by lazy writes to disk and thus improves
+the consistency of the performance measurements.
+
+12. Added SIMD acceleration for Huffman encoding on SSE2-capable x86 and x86-64
+platforms.  This speeds up the compression of full-color JPEGs by about 10-15%
+on average (relative to libjpeg-turbo 1.4.x) when using modern Intel and AMD
+CPUs.  Additionally, this works around an issue in the clang optimizer that
+prevents it (as of this writing) from achieving the same performance as GCC
+when compiling the C version of the Huffman encoder
+(<https://llvm.org/bugs/show_bug.cgi?id=16035>).  For the purposes of
+benchmarking or regression testing, SIMD-accelerated Huffman encoding can be
+disabled by setting the `JSIMD_NOHUFFENC` environment variable to `1`.
+
+13. Added ARM 64-bit (ARMv8) NEON SIMD implementations of the commonly-used
+compression algorithms (including the accurate integer forward DCT and h2v2 &
+h2v1 downsampling algorithms, which are not accelerated in the 32-bit NEON
+implementation.)  This speeds up the compression of full-color JPEGs by about
+75% on average on a Cavium ThunderX processor and by about 2-2.5x on average on
+Cortex-A53 and Cortex-A57 cores.
+
+14. Added SIMD acceleration for Huffman encoding on NEON-capable ARM 32-bit
+and 64-bit platforms.
+
+    For 32-bit code, this speeds up the compression of full-color JPEGs by
+about 30% on average on a typical iOS device (iPhone 4S, Cortex-A9) and by
+about 6-7% on average on a typical Android device (Nexus 5X, Cortex-A53 and
+Cortex-A57), relative to libjpeg-turbo 1.4.x.  Note that the larger speedup
+under iOS is due to the fact that iOS builds use LLVM, which does not optimize
+the C Huffman encoder as well as GCC does.
+
+    For 64-bit code, NEON-accelerated Huffman encoding speeds up the
+compression of full-color JPEGs by about 40% on average on a typical iOS device
+(iPhone 5S, Apple A7) and by about 7-8% on average on a typical Android device
+(Nexus 5X, Cortex-A53 and Cortex-A57), in addition to the speedup described in
+[13] above.
+
+    For the purposes of benchmarking or regression testing, SIMD-accelerated
+Huffman encoding can be disabled by setting the `JSIMD_NOHUFFENC` environment
+variable to `1`.
+
+15. pkg-config (.pc) scripts are now included for both the libjpeg and
+TurboJPEG API libraries on Un*x systems.  Note that if a project's build system
+relies on these scripts, then it will not be possible to build that project
+with libjpeg or with a prior version of libjpeg-turbo.
+
+16. Optimized the ARM 64-bit (ARMv8) NEON SIMD decompression routines to
+improve performance on CPUs with in-order pipelines.  This speeds up the
+decompression of full-color JPEGs by nearly 2x on average on a Cavium ThunderX
+processor and by about 15% on average on a Cortex-A53 core.
+
+17. Fixed an issue in the accelerated Huffman decoder that could have caused
+the decoder to read past the end of the input buffer when a malformed,
+specially-crafted JPEG image was being decompressed.  In prior versions of
+libjpeg-turbo, the accelerated Huffman decoder was invoked (in most cases) only
+if there were > 128 bytes of data in the input buffer.  However, it is possible
+to construct a JPEG image in which a single Huffman block is over 430 bytes
+long, so this version of libjpeg-turbo activates the accelerated Huffman
+decoder only if there are > 512 bytes of data in the input buffer.
+
+18. Fixed a memory leak in tjunittest encountered when running the program
+with the `-yuv` option.
+
+
+1.4.2
+=====
+
+### Significant changes relative to 1.4.1:
+
+1. Fixed an issue whereby cjpeg would segfault if a Windows bitmap with a
+negative width or height was used as an input image (Windows bitmaps can have
+a negative height if they are stored in top-down order, but such files are
+rare and not supported by libjpeg-turbo.)
+
+2. Fixed an issue whereby, under certain circumstances, libjpeg-turbo would
+incorrectly encode certain JPEG images when quality=100 and the fast integer
+forward DCT were used.  This was known to cause `make test` to fail when the
+library was built with `-march=haswell` on x86 systems.
+
+3. Fixed an issue whereby libjpeg-turbo would crash when built with the latest
+& greatest development version of the Clang/LLVM compiler.  This was caused by
+an x86-64 ABI conformance issue in some of libjpeg-turbo's 64-bit SSE2 SIMD
+routines.  Those routines were incorrectly using a 64-bit `mov` instruction to
+transfer a 32-bit JDIMENSION argument, whereas the x86-64 ABI allows the upper
+(unused) 32 bits of a 32-bit argument's register to be undefined.  The new
+Clang/LLVM optimizer uses load combining to transfer multiple adjacent 32-bit
+structure members into a single 64-bit register, and this exposed the ABI
+conformance issue.
+
+4. Fixed a bug in the MIPS DSPr2 4:2:0 "plain" (non-fancy and non-merged)
+upsampling routine that caused a buffer overflow (and subsequent segfault) when
+decompressing a 4:2:0 JPEG image whose scaled output width was less than 16
+pixels.  The "plain" upsampling routines are normally only used when
+decompressing a non-YCbCr JPEG image, but they are also used when decompressing
+a JPEG image whose scaled output height is 1.
+
+5. Fixed various negative left shifts and other issues reported by the GCC and
+Clang undefined behavior sanitizers.  None of these was known to pose a
+security threat, but removing the warnings makes it easier to detect actual
+security issues, should they arise in the future.
+
+
+1.4.1
+=====
+
+### Significant changes relative to 1.4.0:
+
+1. tjbench now properly handles CMYK/YCCK JPEG files.  Passing an argument of
+`-cmyk` (instead of, for instance, `-rgb`) will cause tjbench to internally
+convert the source bitmap to CMYK prior to compression, to generate YCCK JPEG
+files, and to internally convert the decompressed CMYK pixels back to RGB after
+decompression (the latter is done automatically if a CMYK or YCCK JPEG is
+passed to tjbench as a source image.)  The CMYK<->RGB conversion operation is
+not benchmarked.  NOTE: The quick & dirty CMYK<->RGB conversions that tjbench
+uses are suitable for testing only.  Proper conversion between CMYK and RGB
+requires a color management system.
+
+2. `make test` now performs additional bitwise regression tests using tjbench,
+mainly for the purpose of testing compression from/decompression to a subregion
+of a larger image buffer.
+
+3. `make test` no longer tests the regression of the floating point DCT/IDCT
+by default, since the results of those tests can vary if the algorithms in
+question are not implemented using SIMD instructions on a particular platform.
+See the comments in [Makefile.am](Makefile.am) for information on how to
+re-enable the tests and to specify an expected result for them based on the
+particulars of your platform.
+
+4. The NULL color conversion routines have been significantly optimized,
+which speeds up the compression of RGB and CMYK JPEGs by 5-20% when using
+64-bit code and 0-3% when using 32-bit code, and the decompression of those
+images by 10-30% when using 64-bit code and 3-12% when using 32-bit code.
+
+5. Fixed an "illegal instruction" error that occurred when djpeg from a
+SIMD-enabled libjpeg-turbo MIPS build was executed with the `-nosmooth` option
+on a MIPS machine that lacked DSPr2 support.  The MIPS SIMD routines for h2v1
+and h2v2 merged upsampling were not properly checking for the existence of
+DSPr2.
+
+6. Performance has been improved significantly on 64-bit non-Linux and
+non-Windows platforms (generally 10-20% faster compression and 5-10% faster
+decompression.)  Due to an oversight, the 64-bit version of the accelerated
+Huffman codec was not being compiled in when libjpeg-turbo was built on
+platforms other than Windows or Linux.  Oops.
+
+7. Fixed an extremely rare bug in the Huffman encoder that caused 64-bit
+builds of libjpeg-turbo to incorrectly encode a few specific test images when
+quality=98, an optimized Huffman table, and the accurate integer forward DCT
+were used.
+
+8. The Windows (CMake) build system now supports building only static or only
+shared libraries.  This is accomplished by adding either `-DENABLE_STATIC=0` or
+`-DENABLE_SHARED=0` to the CMake command line.
+
+9. TurboJPEG API functions will now return an error code if a warning is
+triggered in the underlying libjpeg API.  For instance, if a JPEG file is
+corrupt, the TurboJPEG decompression functions will attempt to decompress
+as much of the image as possible, but those functions will now return -1 to
+indicate that the decompression was not entirely successful.
+
+10. Fixed a bug in the MIPS DSPr2 4:2:2 fancy upsampling routine that caused a
+buffer overflow (and subsequent segfault) when decompressing a 4:2:2 JPEG image
+in which the right-most MCU was 5 or 6 pixels wide.
+
+
+1.4.0
+=====
+
+### Significant changes relative to 1.4 beta1:
+
+1. Fixed a build issue on OS X PowerPC platforms (md5cmp failed to build
+because OS X does not provide the `le32toh()` and `htole32()` functions.)
+
+2. The non-SIMD RGB565 color conversion code did not work correctly on big
+endian machines.  This has been fixed.
+
+3. Fixed an issue in `tjPlaneSizeYUV()` whereby it would erroneously return 1
+instead of -1 if `componentID` was > 0 and `subsamp` was `TJSAMP_GRAY`.
+
+3. Fixed an issue in `tjBufSizeYUV2()` whereby it would erroneously return 0
+instead of -1 if `width` was < 1.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM64 platforms (see 1.4 beta1[5].)
+
+6. The `close()` method in the TJCompressor and TJDecompressor Java classes is
+now idempotent.  Previously, that method would call the native `tjDestroy()`
+function even if the TurboJPEG instance had already been destroyed.  This
+caused an exception to be thrown during finalization, if the `close()` method
+had already been called.  The exception was caught, but it was still an
+expensive operation.
+
+7. The TurboJPEG API previously generated an error (`Could not determine
+subsampling type for JPEG image`) when attempting to decompress grayscale JPEG
+images that were compressed with a sampling factor other than 1 (for instance,
+with `cjpeg -grayscale -sample 2x2`).  Subsampling technically has no meaning
+with grayscale JPEGs, and thus the horizontal and vertical sampling factors
+for such images are ignored by the decompressor.  However, the TurboJPEG API
+was being too rigid and was expecting the sampling factors to be equal to 1
+before it treated the image as a grayscale JPEG.
+
+8. cjpeg, djpeg, and jpegtran now accept an argument of `-version`, which will
+print the library version and exit.
+
+9. Referring to 1.4 beta1[15], another extremely rare circumstance was
+discovered under which the Huffman encoder's local buffer can be overrun
+when a buffered destination manager is being used and an
+extremely-high-frequency block (basically junk image data) is being encoded.
+Even though the Huffman local buffer was increased from 128 bytes to 136 bytes
+to address the previous issue, the new issue caused even the larger buffer to
+be overrun.  Further analysis reveals that, in the absolute worst case (such as
+setting alternating AC coefficients to 32767 and -32768 in the JPEG scanning
+order), the Huffman encoder can produce encoded blocks that approach double the
+size of the unencoded blocks.  Thus, the Huffman local buffer was increased to
+256 bytes, which should prevent any such issue from re-occurring in the future.
+
+10. The new `tjPlaneSizeYUV()`, `tjPlaneWidth()`, and `tjPlaneHeight()`
+functions were not actually usable on any platform except OS X and Windows,
+because those functions were not included in the libturbojpeg mapfile.  This
+has been fixed.
+
+11. Restored the `JPP()`, `JMETHOD()`, and `FAR` macros in the libjpeg-turbo
+header files.  The `JPP()` and `JMETHOD()` macros were originally implemented
+in libjpeg as a way of supporting non-ANSI compilers that lacked support for
+prototype parameters.  libjpeg-turbo has never supported such compilers, but
+some software packages still use the macros to define their own prototypes.
+Similarly, libjpeg-turbo has never supported MS-DOS and other platforms that
+have far symbols, but some software packages still use the `FAR` macro.  A
+pretty good argument can be made that this is a bad practice on the part of the
+software in question, but since this affects more than one package, it's just
+easier to fix it here.
+
+12. Fixed issues that were preventing the ARM 64-bit SIMD code from compiling
+for iOS, and included an ARMv8 architecture in all of the binaries installed by
+the "official" libjpeg-turbo SDK for OS X.
+
+
+1.3.90 (1.4 beta1)
+==================
+
+### Significant changes relative to 1.3.1:
+
+1. New features in the TurboJPEG API:
+
+     - YUV planar images can now be generated with an arbitrary line padding
+(previously only 4-byte padding, which was compatible with X Video, was
+supported.)
+     - The decompress-to-YUV function has been extended to support image
+scaling.
+     - JPEG images can now be compressed from YUV planar source images.
+     - YUV planar images can now be decoded into RGB or grayscale images.
+     - 4:1:1 subsampling is now supported.  This is mainly included for
+compatibility, since 4:1:1 is not fully accelerated in libjpeg-turbo and has no
+significant advantages relative to 4:2:0.
+     - CMYK images are now supported.  This feature allows CMYK source images
+to be compressed to YCCK JPEGs and YCCK or CMYK JPEGs to be decompressed to
+CMYK destination images.  Conversion between CMYK/YCCK and RGB or YUV images is
+not supported.  Such conversion requires a color management system and is thus
+out of scope for a codec library.
+     - The handling of YUV images in the Java API has been significantly
+refactored and should now be much more intuitive.
+     - The Java API now supports encoding a YUV image from an arbitrary
+position in a large image buffer.
+     - All of the YUV functions now have a corresponding function that operates
+on separate image planes instead of a unified image buffer.  This allows for
+compressing/decoding from or decompressing/encoding to a subregion of a larger
+YUV image.  It also allows for handling YUV formats that swap the order of the
+U and V planes.
+
+2. Added SIMD acceleration for DSPr2-capable MIPS platforms.  This speeds up
+the compression of full-color JPEGs by 70-80% on such platforms and
+decompression by 25-35%.
+
+3. If an application attempts to decompress a Huffman-coded JPEG image whose
+header does not contain Huffman tables, libjpeg-turbo will now insert the
+default Huffman tables.  In order to save space, many motion JPEG video frames
+are encoded without the default Huffman tables, so these frames can now be
+successfully decompressed by libjpeg-turbo without additional work on the part
+of the application.  An application can still override the Huffman tables, for
+instance to re-use tables from a previous frame of the same video.
+
+4. The Mac packaging system now uses pkgbuild and productbuild rather than
+PackageMaker (which is obsolete and no longer supported.)  This means that
+OS X 10.6 "Snow Leopard" or later must be used when packaging libjpeg-turbo,
+although the packages produced can be installed on OS X 10.5 "Leopard" or
+later.  OS X 10.4 "Tiger" is no longer supported.
+
+5. The Huffman encoder now uses `clz` and `bsr` instructions for bit counting
+on ARM platforms rather than a lookup table.  This reduces the memory footprint
+by 64k, which may be important for some mobile applications.  Out of four
+Android devices that were tested, two demonstrated a small overall performance
+loss (~3-4% on average) with ARMv6 code and a small gain (also ~3-4%) with
+ARMv7 code when enabling this new feature, but the other two devices
+demonstrated a significant overall performance gain with both ARMv6 and ARMv7
+code (~10-20%) when enabling the feature.  Actual mileage may vary.
+
+6. Worked around an issue with Visual C++ 2010 and later that caused incorrect
+pixels to be generated when decompressing a JPEG image to a 256-color bitmap,
+if compiler optimization was enabled when libjpeg-turbo was built.  This caused
+the regression tests to fail when doing a release build under Visual C++ 2010
+and later.
+
+7. Improved the accuracy and performance of the non-SIMD implementation of the
+floating point inverse DCT (using code borrowed from libjpeg v8a and later.)
+The accuracy of this implementation now matches the accuracy of the SSE/SSE2
+implementation.  Note, however, that the floating point DCT/IDCT algorithms are
+mainly a legacy feature.  They generally do not produce significantly better
+accuracy than the accurate integer DCT/IDCT algorithms, and they are quite a
+bit slower.
+
+8. Added a new output colorspace (`JCS_RGB565`) to the libjpeg API that allows
+for decompressing JPEG images into RGB565 (16-bit) pixels.  If dithering is not
+used, then this code path is SIMD-accelerated on ARM platforms.
+
+9. Numerous obsolete features, such as support for non-ANSI compilers and
+support for the MS-DOS memory model, were removed from the libjpeg code,
+greatly improving its readability and making it easier to maintain and extend.
+
+10. Fixed a segfault that occurred when calling `output_message()` with
+`msg_code` set to `JMSG_COPYRIGHT`.
+
+11. Fixed an issue whereby wrjpgcom was allowing comments longer than 65k
+characters to be passed on the command line, which was causing it to generate
+incorrect JPEG files.
+
+12. Fixed a bug in the build system that was causing the Windows version of
+wrjpgcom to be built using the rdjpgcom source code.
+
+13. Restored 12-bit-per-component JPEG support.  A 12-bit version of
+libjpeg-turbo can now be built by passing an argument of `--with-12bit` to
+configure (Unix) or `-DWITH_12BIT=1` to cmake (Windows.)  12-bit JPEG support
+is included only for convenience.  Enabling this feature disables all of the
+performance features in libjpeg-turbo, as well as arithmetic coding and the
+TurboJPEG API.  The resulting library still contains the other libjpeg-turbo
+features (such as the colorspace extensions), but in general, it performs no
+faster than libjpeg v6b.
+
+14. Added ARM 64-bit SIMD acceleration for the YCC-to-RGB color conversion
+and IDCT algorithms (both are used during JPEG decompression.)  For unknown
+reasons (probably related to clang), this code cannot currently be compiled for
+iOS.
+
+15. Fixed an extremely rare bug (CVE-2014-9092) that could cause the Huffman
+encoder's local buffer to overrun when a very high-frequency MCU is compressed
+using quality 100 and no subsampling, and when the JPEG output buffer is being
+dynamically resized by the destination manager.  This issue was so rare that,
+even with a test program specifically designed to make the bug occur (by
+injecting random high-frequency YUV data into the compressor), it was
+reproducible only once in about every 25 million iterations.
+
+16. Fixed an oversight in the TurboJPEG C wrapper:  if any of the JPEG
+compression functions was called repeatedly with the same
+automatically-allocated destination buffer, then TurboJPEG would erroneously
+assume that the `jpegSize` parameter was equal to the size of the buffer, when
+in fact that parameter was probably equal to the size of the most recently
+compressed JPEG image.  If the size of the previous JPEG image was not as large
+as the current JPEG image, then TurboJPEG would unnecessarily reallocate the
+destination buffer.
+
+
+1.3.1
+=====
+
+### Significant changes relative to 1.3.0:
+
+1. On Un*x systems, `make install` now installs the libjpeg-turbo libraries
+into /opt/libjpeg-turbo/lib32 by default on any 32-bit system, not just x86,
+and into /opt/libjpeg-turbo/lib64 by default on any 64-bit system, not just
+x86-64.  You can override this by overriding either the `prefix` or `libdir`
+configure variables.
+
+2. The Windows installer now places a copy of the TurboJPEG DLLs in the same
+directory as the rest of the libjpeg-turbo binaries.  This was mainly done
+to support TurboVNC 1.3, which bundles the DLLs in its Windows installation.
+When using a 32-bit version of CMake on 64-bit Windows, it is impossible to
+access the c:\WINDOWS\system32 directory, which made it impossible for the
+TurboVNC build scripts to bundle the 64-bit TurboJPEG DLL.
+
+3. Fixed a bug whereby attempting to encode a progressive JPEG with arithmetic
+entropy coding (by passing arguments of `-progressive -arithmetic` to cjpeg or
+jpegtran, for instance) would result in an error, `Requested feature was
+omitted at compile time`.
+
+4. Fixed a couple of issues (CVE-2013-6629 and CVE-2013-6630) whereby malformed
+JPEG images would cause libjpeg-turbo to use uninitialized memory during
+decompression.
+
+5. Fixed an error (`Buffer passed to JPEG library is too small`) that occurred
+when calling the TurboJPEG YUV encoding function with a very small (< 5x5)
+source image, and added a unit test to check for this error.
+
+6. The Java classes should now build properly under Visual Studio 2010 and
+later.
+
+7. Fixed an issue that prevented SRPMs generated using the in-tree packaging
+tools from being rebuilt on certain newer Linux distributions.
+
+8. Numerous minor fixes to eliminate compilation and build/packaging system
+warnings, fix cosmetic issues, improve documentation clarity, and other general
+source cleanup.
+
+
+1.3.0
+=====
+
+### Significant changes relative to 1.3 beta1:
+
+1. `make test` now works properly on FreeBSD, and it no longer requires the
+md5sum executable to be present on other Un*x platforms.
+
+2. Overhauled the packaging system:
+
+     - To avoid conflict with vendor-supplied libjpeg-turbo packages, the
+official RPMs and DEBs for libjpeg-turbo have been renamed to
+"libjpeg-turbo-official".
+     - The TurboJPEG libraries are now located under /opt/libjpeg-turbo in the
+official Linux and Mac packages, to avoid conflict with vendor-supplied
+packages and also to streamline the packaging system.
+     - Release packages are now created with the directory structure defined
+by the configure variables `prefix`, `bindir`, `libdir`, etc. (Un\*x) or by the
+`CMAKE_INSTALL_PREFIX` variable (Windows.)  The exception is that the docs are
+always located under the system default documentation directory on Un\*x and
+Mac systems, and on Windows, the TurboJPEG DLL is always located in the Windows
+system directory.
+     - To avoid confusion, official libjpeg-turbo packages on Linux/Unix
+platforms (except for Mac) will always install the 32-bit libraries in
+/opt/libjpeg-turbo/lib32 and the 64-bit libraries in /opt/libjpeg-turbo/lib64.
+     - Fixed an issue whereby, in some cases, the libjpeg-turbo executables on
+Un*x systems were not properly linking with the shared libraries installed by
+the same package.
+     - Fixed an issue whereby building the "installer" target on Windows when
+`WITH_JAVA=1` would fail if the TurboJPEG JAR had not been previously built.
+     - Building the "install" target on Windows now installs files into the
+same places that the installer does.
+
+3. Fixed a Huffman encoder bug that prevented I/O suspension from working
+properly.
+
+
+1.2.90 (1.3 beta1)
+==================
+
+### Significant changes relative to 1.2.1:
+
+1. Added support for additional scaling factors (3/8, 5/8, 3/4, 7/8, 9/8, 5/4,
+11/8, 3/2, 13/8, 7/4, 15/8, and 2) when decompressing.  Note that the IDCT will
+not be SIMD-accelerated when using any of these new scaling factors.
+
+2. The TurboJPEG dynamic library is now versioned.  It was not strictly
+necessary to do so, because TurboJPEG uses versioned symbols, and if a function
+changes in an ABI-incompatible way, that function is renamed and a legacy
+function is provided to maintain backward compatibility.  However, certain
+Linux distro maintainers have a policy against accepting any library that isn't
+versioned.
+
+3. Extended the TurboJPEG Java API so that it can be used to compress a JPEG
+image from and decompress a JPEG image to an arbitrary position in a large
+image buffer.
+
+4. The `tjDecompressToYUV()` function now supports the `TJFLAG_FASTDCT` flag.
+
+5. The 32-bit supplementary package for amd64 Debian systems now provides
+symlinks in /usr/lib/i386-linux-gnu for the TurboJPEG libraries in /usr/lib32.
+This allows those libraries to be used on MultiArch-compatible systems (such as
+Ubuntu 11 and later) without setting the linker path.
+
+6. The TurboJPEG Java wrapper should now find the JNI library on Mac systems
+without having to pass `-Djava.library.path=/usr/lib` to java.
+
+7. TJBench has been ported to Java to provide a convenient way of validating
+the performance of the TurboJPEG Java API.  It can be run with
+`java -cp turbojpeg.jar TJBench`.
+
+8. cjpeg can now be used to generate JPEG files with the RGB colorspace
+(feature ported from jpeg-8d.)
+
+9. The width and height in the `-crop` argument passed to jpegtran can now be
+suffixed with `f` to indicate that, when the upper left corner of the cropping
+region is automatically moved to the nearest iMCU boundary, the bottom right
+corner should be moved by the same amount.  In other words, this feature causes
+jpegtran to strictly honor the specified width/height rather than the specified
+bottom right corner (feature ported from jpeg-8d.)
+
+10. JPEG files using the RGB colorspace can now be decompressed into grayscale
+images (feature ported from jpeg-8d.)
+
+11. Fixed a regression caused by 1.2.1[7] whereby the build would fail with
+multiple "Mismatch in operand sizes" errors when attempting to build the x86
+SIMD code with NASM 0.98.
+
+12. The in-memory source/destination managers (`jpeg_mem_src()` and
+`jpeg_mem_dest()`) are now included by default when building libjpeg-turbo with
+libjpeg v6b or v7 emulation, so that programs can take advantage of these
+functions without requiring the use of the backward-incompatible libjpeg v8
+ABI.  The "age number" of the libjpeg-turbo library on Un*x systems has been
+incremented by 1 to reflect this.  You can disable this feature with a
+configure/CMake switch in order to retain strict API/ABI compatibility with the
+libjpeg v6b or v7 API/ABI (or with previous versions of libjpeg-turbo.)  See
+[README.md](README.md) for more details.
+
+13. Added ARMv7s architecture to libjpeg.a and libturbojpeg.a in the official
+libjpeg-turbo binary package for OS X, so that those libraries can be used to
+build applications that leverage the faster CPUs in the iPhone 5 and iPad 4.
+
+
+1.2.1
+=====
+
+### Significant changes relative to 1.2.0:
+
+1. Creating or decoding a JPEG file that uses the RGB colorspace should now
+properly work when the input or output colorspace is one of the libjpeg-turbo
+colorspace extensions.
+
+2. When libjpeg-turbo was built without SIMD support and merged (non-fancy)
+upsampling was used along with an alpha-enabled colorspace during
+decompression, the unused byte of the decompressed pixels was not being set to
+0xFF.  This has been fixed.  TJUnitTest has also been extended to test for the
+correct behavior of the colorspace extensions when merged upsampling is used.
+
+3. Fixed a bug whereby the libjpeg-turbo SSE2 SIMD code would not preserve the
+upper 64 bits of xmm6 and xmm7 on Win64 platforms, which violated the Win64
+calling conventions.
+
+4. Fixed a regression (CVE-2012-2806) caused by 1.2.0[6] whereby decompressing
+corrupt JPEG images (specifically, images in which the component count was
+erroneously set to a large value) would cause libjpeg-turbo to segfault.
+
+5. Worked around a severe performance issue with "Bobcat" (AMD Embedded APU)
+processors.  The `MASKMOVDQU` instruction, which was used by the libjpeg-turbo
+SSE2 SIMD code, is apparently implemented in microcode on AMD processors, and
+it is painfully slow on Bobcat processors in particular.  Eliminating the use
+of this instruction improved performance by an order of magnitude on Bobcat
+processors and by a small amount (typically 5%) on AMD desktop processors.
+
+6. Added SIMD acceleration for performing 4:2:2 upsampling on NEON-capable ARM
+platforms.  This speeds up the decompression of 4:2:2 JPEGs by 20-25% on such
+platforms.
+
+7. Fixed a regression caused by 1.2.0[2] whereby, on Linux/x86 platforms
+running the 32-bit SSE2 SIMD code in libjpeg-turbo, decompressing a 4:2:0 or
+4:2:2 JPEG image into a 32-bit (RGBX, BGRX, etc.) buffer without using fancy
+upsampling would produce several incorrect columns of pixels at the right-hand
+side of the output image if each row in the output image was not evenly
+divisible by 16 bytes.
+
+8. Fixed an issue whereby attempting to build the SIMD extensions with Xcode
+4.3 on OS X platforms would cause NASM to return numerous errors of the form
+"'%define' expects a macro identifier".
+
+9. Added flags to the TurboJPEG API that allow the caller to force the use of
+either the fast or the accurate DCT/IDCT algorithms in the underlying codec.
+
+
+1.2.0
+=====
+
+### Significant changes relative to 1.2 beta1:
+
+1. Fixed build issue with Yasm on Unix systems (the libjpeg-turbo build system
+was not adding the current directory to the assembler include path, so Yasm
+was not able to find jsimdcfg.inc.)
+
+2. Fixed out-of-bounds read in SSE2 SIMD code that occurred when decompressing
+a JPEG image to a bitmap buffer whose size was not a multiple of 16 bytes.
+This was more of an annoyance than an actual bug, since it did not cause any
+actual run-time problems, but the issue showed up when running libjpeg-turbo in
+valgrind.  See <http://crbug.com/72399> for more information.
+
+3. Added a compile-time macro (`LIBJPEG_TURBO_VERSION`) that can be used to
+check the version of libjpeg-turbo against which an application was compiled.
+
+4. Added new RGBA/BGRA/ABGR/ARGB colorspace extension constants (libjpeg API)
+and pixel formats (TurboJPEG API), which allow applications to specify that,
+when decompressing to a 4-component RGB buffer, the unused byte should be set
+to 0xFF so that it can be interpreted as an opaque alpha channel.
+
+5. Fixed regression issue whereby DevIL failed to build against libjpeg-turbo
+because libjpeg-turbo's distributed version of jconfig.h contained an `INLINE`
+macro, which conflicted with a similar macro in DevIL.  This macro is used only
+internally when building libjpeg-turbo, so it was moved into config.h.
+
+6. libjpeg-turbo will now correctly decompress erroneous CMYK/YCCK JPEGs whose
+K component is assigned a component ID of 1 instead of 4.  Although these files
+are in violation of the spec, other JPEG implementations handle them
+correctly.
+
+7. Added ARMv6 and ARMv7 architectures to libjpeg.a and libturbojpeg.a in
+the official libjpeg-turbo binary package for OS X, so that those libraries can
+be used to build both OS X and iOS applications.
+
+
+1.1.90 (1.2 beta1)
+==================
+
+### Significant changes relative to 1.1.1:
+
+1. Added a Java wrapper for the TurboJPEG API.  See [java/README](java/README)
+for more details.
+
+2. The TurboJPEG API can now be used to scale down images during
+decompression.
+
+3. Added SIMD routines for RGB-to-grayscale color conversion, which
+significantly improves the performance of grayscale JPEG compression from an
+RGB source image.
+
+4. Improved the performance of the C color conversion routines, which are used
+on platforms for which SIMD acceleration is not available.
+
+5. Added a function to the TurboJPEG API that performs lossless transforms.
+This function is implemented using the same back end as jpegtran, but it
+performs transcoding entirely in memory and allows multiple transforms and/or
+crop operations to be batched together, so the source coefficients only need to
+be read once.  This is useful when generating image tiles from a single source
+JPEG.
+
+6. Added tests for the new TurboJPEG scaled decompression and lossless
+transform features to tjbench (the TurboJPEG benchmark, formerly called
+"jpgtest".)
+
+7. Added support for 4:4:0 (transposed 4:2:2) subsampling in TurboJPEG, which
+was necessary in order for it to read 4:2:2 JPEG files that had been losslessly
+transposed or rotated 90 degrees.
+
+8. All legacy VirtualGL code has been re-factored, and this has allowed
+libjpeg-turbo, in its entirety, to be re-licensed under a BSD-style license.
+
+9. libjpeg-turbo can now be built with Yasm.
+
+10. Added SIMD acceleration for ARM Linux and iOS platforms that support
+NEON instructions.
+
+11. Refactored the TurboJPEG C API and documented it using Doxygen.  The
+TurboJPEG 1.2 API uses pixel formats to define the size and component order of
+the uncompressed source/destination images, and it includes a more efficient
+version of `TJBUFSIZE()` that computes a worst-case JPEG size based on the
+level of chrominance subsampling.  The refactored implementation of the
+TurboJPEG API now uses the libjpeg memory source and destination managers,
+which allows the TurboJPEG compressor to grow the JPEG buffer as necessary.
+
+12. Eliminated errors in the output of jpegtran on Windows that occurred when
+the application was invoked using I/O redirection
+(`jpegtran <input.jpg >output.jpg`.)
+
+13. The inclusion of libjpeg v7 and v8 emulation as well as arithmetic coding
+support in libjpeg-turbo v1.1.0 introduced several new error constants in
+jerror.h, and these were mistakenly enabled for all emulation modes, causing
+the error enum in libjpeg-turbo to sometimes have different values than the
+same enum in libjpeg.  This represents an ABI incompatibility, and it caused
+problems with rare applications that took specific action based on a particular
+error value.  The fix was to include the new error constants conditionally
+based on whether libjpeg v7 or v8 emulation was enabled.
+
+14. Fixed an issue whereby Windows applications that used libjpeg-turbo would
+fail to compile if the Windows system headers were included before jpeglib.h.
+This issue was caused by a conflict in the definition of the INT32 type.
+
+15. Fixed 32-bit supplementary package for amd64 Debian systems, which was
+broken by enhancements to the packaging system in 1.1.
+
+16. When decompressing a JPEG image using an output colorspace of
+`JCS_EXT_RGBX`, `JCS_EXT_BGRX`, `JCS_EXT_XBGR`, or `JCS_EXT_XRGB`,
+libjpeg-turbo will now set the unused byte to 0xFF, which allows applications
+to interpret that byte as an alpha channel (0xFF = opaque).
+
+
+1.1.1
+=====
+
+### Significant changes relative to 1.1.0:
+
+1. Fixed a 1-pixel error in row 0, column 21 of the luminance plane generated
+by `tjEncodeYUV()`.
+
+2. libjpeg-turbo's accelerated Huffman decoder previously ignored unexpected
+markers found in the middle of the JPEG data stream during decompression.  It
+will now hand off decoding of a particular block to the unaccelerated Huffman
+decoder if an unexpected marker is found, so that the unaccelerated Huffman
+decoder can generate an appropriate warning.
+
+3. Older versions of MinGW64 prefixed symbol names with underscores by
+default, which differed from the behavior of 64-bit Visual C++.  MinGW64 1.0
+has adopted the behavior of 64-bit Visual C++ as the default, so to accommodate
+this, the libjpeg-turbo SIMD function names are no longer prefixed with an
+underscore when building with MinGW64.  This means that, when building
+libjpeg-turbo with older versions of MinGW64, you will now have to add
+`-fno-leading-underscore` to the `CFLAGS`.
+
+4. Fixed a regression bug in the NSIS script that caused the Windows installer
+build to fail when using the Visual Studio IDE.
+
+5. Fixed a bug in `jpeg_read_coefficients()` whereby it would not initialize
+`cinfo->image_width` and `cinfo->image_height` if libjpeg v7 or v8 emulation
+was enabled.  This specifically caused the jpegoptim program to fail if it was
+linked against a version of libjpeg-turbo that was built with libjpeg v7 or v8
+emulation.
+
+6. Eliminated excessive I/O overhead that occurred when reading BMP files in
+cjpeg.
+
+7. Eliminated errors in the output of cjpeg on Windows that occurred when the
+application was invoked using I/O redirection (`cjpeg <inputfile >output.jpg`.)
+
+
+1.1.0
+=====
+
+### Significant changes relative to 1.1 beta1:
+
+1. The algorithm used by the SIMD quantization function cannot produce correct
+results when the JPEG quality is >= 98 and the fast integer forward DCT is
+used.  Thus, the non-SIMD quantization function is now used for those cases,
+and libjpeg-turbo should now produce identical output to libjpeg v6b in all
+cases.
+
+2. Despite the above, the fast integer forward DCT still degrades somewhat for
+JPEG qualities greater than 95, so the TurboJPEG wrapper will now automatically
+use the accurate integer forward DCT when generating JPEG images of quality 96
+or greater.  This reduces compression performance by as much as 15% for these
+high-quality images but is necessary to ensure that the images are perceptually
+lossless.  It also ensures that the library can avoid the performance pitfall
+created by [1].
+
+3. Ported jpgtest.cxx to pure C to avoid the need for a C++ compiler.
+
+4. Fixed visual artifacts in grayscale JPEG compression caused by a typo in
+the RGB-to-luminance lookup tables.
+
+5. The Windows distribution packages now include the libjpeg run-time programs
+(cjpeg, etc.)
+
+6. All packages now include jpgtest.
+
+7. The TurboJPEG dynamic library now uses versioned symbols.
+
+8. Added two new TurboJPEG API functions, `tjEncodeYUV()` and
+`tjDecompressToYUV()`, to replace the somewhat hackish `TJ_YUV` flag.
+
+
+1.0.90 (1.1 beta1)
+==================
+
+### Significant changes relative to 1.0.1:
+
+1. Added emulation of the libjpeg v7 and v8 APIs and ABIs.  See
+[README.md](README.md) for more details.  This feature was sponsored by
+CamTrace SAS.
+
+2. Created a new CMake-based build system for the Visual C++ and MinGW builds.
+
+3. Grayscale bitmaps can now be compressed from/decompressed to using the
+TurboJPEG API.
+
+4. jpgtest can now be used to test decompression performance with existing
+JPEG images.
+
+5. If the default install prefix (/opt/libjpeg-turbo) is used, then
+`make install` now creates /opt/libjpeg-turbo/lib32 and
+/opt/libjpeg-turbo/lib64 sym links to duplicate the behavior of the binary
+packages.
+
+6. All symbols in the libjpeg-turbo dynamic library are now versioned, even
+when the library is built with libjpeg v6b emulation.
+
+7. Added arithmetic encoding and decoding support (can be disabled with
+configure or CMake options)
+
+8. Added a `TJ_YUV` flag to the TurboJPEG API, which causes both the compressor
+and decompressor to output planar YUV images.
+
+9. Added an extended version of `tjDecompressHeader()` to the TurboJPEG API,
+which allows the caller to determine the type of subsampling used in a JPEG
+image.
+
+10. Added further protections against invalid Huffman codes.
+
+
+1.0.1
+=====
+
+### Significant changes relative to 1.0.0:
+
+1. The Huffman decoder will now handle erroneous Huffman codes (for instance,
+from a corrupt JPEG image.)  Previously, these would cause libjpeg-turbo to
+crash under certain circumstances.
+
+2. Fixed typo in SIMD dispatch routines that was causing 4:2:2 upsampling to
+be used instead of 4:2:0 when decompressing JPEG images using SSE2 code.
+
+3. The configure script will now automatically determine whether the
+`INCOMPLETE_TYPES_BROKEN` macro should be defined.
+
+
+1.0.0
+=====
+
+### Significant changes relative to 0.0.93:
+
+1. 2983700: Further FreeBSD build tweaks (no longer necessary to specify
+`--host` when configuring on a 64-bit system)
+
+2. Created symlinks in the Unix/Linux packages so that the TurboJPEG
+include file can always be found in /opt/libjpeg-turbo/include, the 32-bit
+static libraries can always be found in /opt/libjpeg-turbo/lib32, and the
+64-bit static libraries can always be found in /opt/libjpeg-turbo/lib64.
+
+3. The Unix/Linux distribution packages now include the libjpeg run-time
+programs (cjpeg, etc.) and man pages.
+
+4. Created a 32-bit supplementary package for amd64 Debian systems, which
+contains just the 32-bit libjpeg-turbo libraries.
+
+5. Moved the libraries from */lib32 to */lib in the i386 Debian package.
+
+6. Include distribution package for Cygwin
+
+7. No longer necessary to specify `--without-simd` on non-x86 architectures,
+and unit tests now work on those architectures.
+
+
+0.0.93
+======
+
+### Significant changes since 0.0.91:
+
+1. 2982659: Fixed x86-64 build on FreeBSD systems
+
+2. 2988188: Added support for Windows 64-bit systems
+
+
+0.0.91
+======
+
+### Significant changes relative to 0.0.90:
+
+1. Added documentation to .deb packages
+
+2. 2968313: Fixed data corruption issues when decompressing large JPEG images
+and/or using buffered I/O with the libjpeg-turbo decompressor
+
+
+0.0.90
+======
+
+Initial release
diff --git a/media/libjpeg/jaricom.c b/media/libjpeg/jaricom.c
index 3bb557f7a3..215640cc44 100644
--- a/media/libjpeg/jaricom.c
+++ b/media/libjpeg/jaricom.c
@@ -4,16 +4,16 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2018, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains probability estimation tables for common use in
  * arithmetic entropy encoding and decoding routines.
  *
- * This data represents Table D.2 in the JPEG spec (ISO/IEC IS 10918-1
- * and CCITT Recommendation ITU-T T.81) and Table 24 in the JBIG spec
- * (ISO/IEC IS 11544 and CCITT Recommendation ITU-T T.82).
+ * This data represents Table D.2 in
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 and Table 24 in
+ * Recommendation ITU-T T.82 (1993) | ISO/IEC 11544:1993.
  */
 
 #define JPEG_INTERNALS
@@ -29,9 +29,10 @@
  * implementation (jbig_tab.c).
  */
 
-#define V(i,a,b,c,d) (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
+#define V(i, a, b, c, d) \
+  (((JLONG)a << 16) | ((JLONG)c << 8) | ((JLONG)d << 7) | b)
 
-const JLONG jpeg_aritab[113+1] = {
+const JLONG jpeg_aritab[113 + 1] = {
 /*
  * Index, Qe_Value, Next_Index_LPS, Next_Index_MPS, Switch_MPS
  */
diff --git a/media/libjpeg/jcapimin.c b/media/libjpeg/jcapimin.c
index 15674be54a..84e7ecc9a7 100644
--- a/media/libjpeg/jcapimin.c
+++ b/media/libjpeg/jcapimin.c
@@ -4,8 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -31,7 +31,7 @@
  */
 
 GLOBAL(void)
-jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
 {
   int i;
 
@@ -41,7 +41,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
     ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
   if (structsize != sizeof(struct jpeg_compress_struct))
     ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
-             (int) sizeof(struct jpeg_compress_struct), (int) structsize);
+             (int)sizeof(struct jpeg_compress_struct), (int)structsize);
 
   /* For debugging purposes, we zero the whole master structure.
    * But the application has already set the err pointer, and may have set
@@ -52,14 +52,14 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
   {
     struct jpeg_error_mgr *err = cinfo->err;
     void *client_data = cinfo->client_data; /* ignore Purify complaint here */
-    MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
+    memset(cinfo, 0, sizeof(struct jpeg_compress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
   }
   cinfo->is_decompressor = FALSE;
 
   /* Initialize a memory manager instance for this object */
-  jinit_memory_mgr((j_common_ptr) cinfo);
+  jinit_memory_mgr((j_common_ptr)cinfo);
 
   /* Zero out pointers to permanent structures. */
   cinfo->progress = NULL;
@@ -83,7 +83,7 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
   /* Must do it here for emit_dqt in case jpeg_write_tables is used */
   cinfo->block_size = DCTSIZE;
   cinfo->natural_order = jpeg_natural_order;
-  cinfo->lim_Se = DCTSIZE2-1;
+  cinfo->lim_Se = DCTSIZE2 - 1;
 #endif
 
   cinfo->script_space = NULL;
@@ -100,9 +100,9 @@ jpeg_CreateCompress (j_compress_ptr cinfo, int version, size_t structsize)
  */
 
 GLOBAL(void)
-jpeg_destroy_compress (j_compress_ptr cinfo)
+jpeg_destroy_compress(j_compress_ptr cinfo)
 {
-  jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+  jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -112,9 +112,9 @@ jpeg_destroy_compress (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_abort_compress (j_compress_ptr cinfo)
+jpeg_abort_compress(j_compress_ptr cinfo)
 {
-  jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+  jpeg_abort((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -131,7 +131,7 @@ jpeg_abort_compress (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
+jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress)
 {
   int i;
   JQUANT_TBL *qtbl;
@@ -159,7 +159,7 @@ jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress)
  */
 
 GLOBAL(void)
-jpeg_finish_compress (j_compress_ptr cinfo)
+jpeg_finish_compress(j_compress_ptr cinfo)
 {
   JDIMENSION iMCU_row;
 
@@ -172,18 +172,18 @@ jpeg_finish_compress (j_compress_ptr cinfo)
   } else if (cinfo->global_state != CSTATE_WRCOEFS)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   /* Perform any remaining passes */
-  while (! cinfo->master->is_last_pass) {
+  while (!cinfo->master->is_last_pass) {
     (*cinfo->master->prepare_for_pass) (cinfo);
     for (iMCU_row = 0; iMCU_row < cinfo->total_iMCU_rows; iMCU_row++) {
       if (cinfo->progress != NULL) {
-        cinfo->progress->pass_counter = (long) iMCU_row;
-        cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows;
-        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        cinfo->progress->pass_counter = (long)iMCU_row;
+        cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows;
+        (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
       }
       /* We bypass the main controller and invoke coef controller directly;
        * all work is being done from the coefficient buffer.
        */
-      if (! (*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE) NULL))
+      if (!(*cinfo->coef->compress_data) (cinfo, (JSAMPIMAGE)NULL))
         ERREXIT(cinfo, JERR_CANT_SUSPEND);
     }
     (*cinfo->master->finish_pass) (cinfo);
@@ -192,7 +192,7 @@ jpeg_finish_compress (j_compress_ptr cinfo)
   (*cinfo->marker->write_file_trailer) (cinfo);
   (*cinfo->dest->term_destination) (cinfo);
   /* We can use jpeg_abort to release memory and reset global_state */
-  jpeg_abort((j_common_ptr) cinfo);
+  jpeg_abort((j_common_ptr)cinfo);
 }
 
 
@@ -204,8 +204,8 @@ jpeg_finish_compress (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_write_marker (j_compress_ptr cinfo, int marker,
-                   const JOCTET *dataptr, unsigned int datalen)
+jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+                  unsigned int datalen)
 {
   void (*write_marker_byte) (j_compress_ptr info, int val);
 
@@ -226,7 +226,7 @@ jpeg_write_marker (j_compress_ptr cinfo, int marker,
 /* Same, but piecemeal. */
 
 GLOBAL(void)
-jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+jpeg_write_m_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
 {
   if (cinfo->next_scanline != 0 ||
       (cinfo->global_state != CSTATE_SCANNING &&
@@ -238,7 +238,7 @@ jpeg_write_m_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
 }
 
 GLOBAL(void)
-jpeg_write_m_byte (j_compress_ptr cinfo, int val)
+jpeg_write_m_byte(j_compress_ptr cinfo, int val)
 {
   (*cinfo->marker->write_marker_byte) (cinfo, val);
 }
@@ -266,13 +266,13 @@ jpeg_write_m_byte (j_compress_ptr cinfo, int val)
  */
 
 GLOBAL(void)
-jpeg_write_tables (j_compress_ptr cinfo)
+jpeg_write_tables(j_compress_ptr cinfo)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
   /* (Re)initialize error mgr and destination modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->dest->init_destination) (cinfo);
   /* Initialize the marker writer ... bit of a crock to do it here. */
   jinit_marker_writer(cinfo);
diff --git a/media/libjpeg/jcapistd.c b/media/libjpeg/jcapistd.c
index 5c6d0be255..aa2aad9f66 100644
--- a/media/libjpeg/jcapistd.c
+++ b/media/libjpeg/jcapistd.c
@@ -36,7 +36,7 @@
  */
 
 GLOBAL(void)
-jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
+jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -45,7 +45,7 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
     jpeg_suppress_tables(cinfo, FALSE); /* mark all tables to be written */
 
   /* (Re)initialize error mgr and destination modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->dest->init_destination) (cinfo);
   /* Perform master selection of active modules */
   jinit_compress_master(cinfo);
@@ -75,8 +75,8 @@ jpeg_start_compress (j_compress_ptr cinfo, boolean write_all_tables)
  */
 
 GLOBAL(JDIMENSION)
-jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
-                      JDIMENSION num_lines)
+jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                     JDIMENSION num_lines)
 {
   JDIMENSION row_ctr, rows_left;
 
@@ -87,9 +87,9 @@ jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->next_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->image_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->image_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Give master control module another chance if this is first call to
@@ -118,8 +118,8 @@ jpeg_write_scanlines (j_compress_ptr cinfo, JSAMPARRAY scanlines,
  */
 
 GLOBAL(JDIMENSION)
-jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
-                     JDIMENSION num_lines)
+jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                    JDIMENSION num_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
@@ -132,9 +132,9 @@ jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->next_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->image_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->next_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->image_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Give master control module another chance if this is first call to
@@ -151,7 +151,7 @@ jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* Directly compress the row. */
-  if (! (*cinfo->coef->compress_data) (cinfo, data)) {
+  if (!(*cinfo->coef->compress_data) (cinfo, data)) {
     /* If compressor did not consume the whole row, suspend processing. */
     return 0;
   }
diff --git a/media/libjpeg/jcarith.c b/media/libjpeg/jcarith.c
index 6d3b8af5b4..b1720521bf 100644
--- a/media/libjpeg/jcarith.c
+++ b/media/libjpeg/jcarith.c
@@ -4,16 +4,19 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2018, 2021-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains portable arithmetic entropy encoding routines for JPEG
- * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
  *
  * Both sequential and progressive modes are supported in this single module.
  *
  * Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
  */
 
 #define JPEG_INTERNALS
@@ -63,8 +66,8 @@ typedef arith_entropy_encoder *arith_entropy_ptr;
  * in the lower bits (mask 0x7F).
  */
 
-#define DC_STAT_BINS 64
-#define AC_STAT_BINS 256
+#define DC_STAT_BINS  64
+#define AC_STAT_BINS  256
 
 /* NOTE: Uncomment the following #define if you want to use the
  * given formula for calculating the AC conditioning parameter Kx
@@ -105,25 +108,25 @@ typedef arith_entropy_encoder *arith_entropy_ptr;
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
 #define ISHIFT_TEMPS    int ishift_temp;
-#define IRIGHT_SHIFT(x,shft)  \
-        ((ishift_temp = (x)) < 0 ? \
-         (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-         (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+  ((ishift_temp = (x)) < 0 ? \
+   (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+   (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft)   ((x) >> (shft))
 #endif
 
 
 LOCAL(void)
-emit_byte (int val, j_compress_ptr cinfo)
+emit_byte(int val, j_compress_ptr cinfo)
 /* Write next output byte; we do not support suspension in this module. */
 {
   struct jpeg_destination_mgr *dest = cinfo->dest;
 
-  *dest->next_output_byte++ = (JOCTET) val;
+  *dest->next_output_byte++ = (JOCTET)val;
   if (--dest->free_in_buffer == 0)
-    if (! (*dest->empty_output_buffer) (cinfo))
+    if (!(*dest->empty_output_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
 }
 
@@ -133,22 +136,22 @@ emit_byte (int val, j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-finish_pass (j_compress_ptr cinfo)
+finish_pass(j_compress_ptr cinfo)
 {
-  arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
   JLONG temp;
 
   /* Section D.1.8: Termination of encoding */
 
   /* Find the e->c in the coding interval with the largest
    * number of trailing zero bits */
-  if ((temp = (e->a - 1 + e->c) & 0xFFFF0000L) < e->c)
+  if ((temp = (e->a - 1 + e->c) & 0xFFFF0000UL) < e->c)
     e->c = temp + 0x8000L;
   else
     e->c = temp;
   /* Send remaining bytes to output */
   e->c <<= e->ct;
-  if (e->c & 0xF8000000L) {
+  if (e->c & 0xF8000000UL) {
     /* One final overflow has to be handled */
     if (e->buffer >= 0) {
       if (e->zc)
@@ -219,9 +222,9 @@ finish_pass (j_compress_ptr cinfo)
  */
 
 LOCAL(void)
-arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
+arith_encode(j_compress_ptr cinfo, unsigned char *st, int val)
 {
-  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
   register unsigned char nl, nm;
   register JLONG qe, temp;
   register int sv;
@@ -231,8 +234,8 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
    */
   sv = *st;
   qe = jpeg_aritab[sv & 0x7F];  /* => Qe_Value */
-  nl = qe & 0xFF; qe >>= 8;     /* Next_Index_LPS + Switch_MPS */
-  nm = qe & 0xFF; qe >>= 8;     /* Next_Index_MPS */
+  nl = qe & 0xFF;  qe >>= 8;    /* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF;  qe >>= 8;    /* Next_Index_MPS */
 
   /* Encode & estimation procedures per sections D.1.4 & D.1.5 */
   e->a -= qe;
@@ -319,9 +322,9 @@ arith_encode (j_compress_ptr cinfo, unsigned char *st, int val)
  */
 
 LOCAL(void)
-emit_restart (j_compress_ptr cinfo, int restart_num)
+emit_restart(j_compress_ptr cinfo, int restart_num)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci;
   jpeg_component_info *compptr;
 
@@ -335,14 +338,14 @@ emit_restart (j_compress_ptr cinfo, int restart_num)
     compptr = cinfo->cur_comp_info[ci];
     /* DC needs no table for refinement scan */
     if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
-      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
       /* Reset DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
     /* AC needs no table when not present */
     if (cinfo->progressive_mode == 0 || cinfo->Se) {
-      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+      memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
     }
   }
 
@@ -362,9 +365,9 @@ emit_restart (j_compress_ptr cinfo, int restart_num)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl;
@@ -391,7 +394,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     /* Compute the DC value after the required point transform by Al.
      * This is simply an arithmetic right shift.
      */
-    m = IRIGHT_SHIFT((int) ((*block)[0]), cinfo->Al);
+    m = IRIGHT_SHIFT((int)((*block)[0]), cinfo->Al);
 
     /* Sections F.1.4.1 & F.1.4.4.1: Encoding of DC coefficients */
 
@@ -432,9 +435,9 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       }
       arith_encode(cinfo, st, 0);
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;    /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] += 8;   /* large diff category */
       /* Figure F.9: Encoding the magnitude bit pattern of v */
       st += 14;
@@ -453,9 +456,9 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int tbl, k, ke;
@@ -510,7 +513,7 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
           break;
         }
       }
-      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+      arith_encode(cinfo, st + 1, 0);  st += 3;  k++;
     }
     st += 2;
     /* Figure F.8: Encoding the magnitude category of v */
@@ -552,9 +555,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   unsigned char *st;
   int Al, blkn;
 
@@ -587,9 +590,9 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int tbl, k, ke, kex;
@@ -662,7 +665,7 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
           break;
         }
       }
-      arith_encode(cinfo, st + 1, 0); st += 3; k++;
+      arith_encode(cinfo, st + 1, 0);  st += 3;  k++;
     }
   }
   /* Encode EOB decision only if k <= cinfo->Se */
@@ -680,9 +683,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
@@ -747,9 +750,9 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       }
       arith_encode(cinfo, st, 0);
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;    /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] += 8;   /* large diff category */
       /* Figure F.9: Encoding the magnitude bit pattern of v */
       st += 14;
@@ -770,7 +773,7 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
       st = entropy->ac_stats[tbl] + 3 * (k - 1);
       arith_encode(cinfo, st, 0);       /* EOB decision */
       while ((v = (*block)[jpeg_natural_order[k]]) == 0) {
-        arith_encode(cinfo, st + 1, 0); st += 3; k++;
+        arith_encode(cinfo, st + 1, 0);  st += 3;  k++;
       }
       arith_encode(cinfo, st + 1, 1);
       /* Figure F.6: Encoding nonzero value v */
@@ -822,9 +825,9 @@ encode_mcu (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(void)
-start_pass (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass(j_compress_ptr cinfo, boolean gather_statistics)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci, tbl;
   jpeg_component_info *compptr;
 
@@ -833,7 +836,7 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
      * We are fully adaptive here and need no extra
      * statistics gathering pass!
      */
-    ERREXIT(cinfo, JERR_NOT_COMPILED);
+    ERREXIT(cinfo, JERR_NOTIMPL);
 
   /* We assume jcmaster.c already validated the progressive scan parameters. */
 
@@ -862,9 +865,9 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->dc_stats[tbl] == NULL)
-        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
-      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+        entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
@@ -875,13 +878,14 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->ac_stats[tbl] == NULL)
-        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
-      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+        entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
 #ifdef CALCULATE_SPECTRAL_CONDITIONING
       if (cinfo->progressive_mode)
         /* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
-        cinfo->arith_ac_K[tbl] = cinfo->Ss + ((8 + cinfo->Se - cinfo->Ss) >> 4);
+        cinfo->arith_ac_K[tbl] = cinfo->Ss +
+                                 ((8 + cinfo->Se - cinfo->Ss) >> 4);
 #endif
     }
   }
@@ -905,15 +909,15 @@ start_pass (j_compress_ptr cinfo, boolean gather_statistics)
  */
 
 GLOBAL(void)
-jinit_arith_encoder (j_compress_ptr cinfo)
+jinit_arith_encoder(j_compress_ptr cinfo)
 {
   arith_entropy_ptr entropy;
   int i;
 
   entropy = (arith_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(arith_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
   entropy->pub.start_pass = start_pass;
   entropy->pub.finish_pass = finish_pass;
 
diff --git a/media/libjpeg/jccoefct.c b/media/libjpeg/jccoefct.c
index a08d6e3230..068232a527 100644
--- a/media/libjpeg/jccoefct.c
+++ b/media/libjpeg/jccoefct.c
@@ -58,21 +58,19 @@ typedef my_coef_controller *my_coef_ptr;
 
 
 /* Forward declarations */
-METHODDEF(boolean) compress_data
-        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 #ifdef FULL_COEF_BUFFER_SUPPORTED
-METHODDEF(boolean) compress_first_pass
-        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
-METHODDEF(boolean) compress_output
-        (j_compress_ptr cinfo, JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_first_pass(j_compress_ptr cinfo,
+                                       JSAMPIMAGE input_buf);
+METHODDEF(boolean) compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf);
 #endif
 
 
 LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
 /* Reset within-iMCU-row counters for a new row */
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* In an interleaved scan, an MCU row is the same as an iMCU row.
    * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -81,7 +79,7 @@ start_iMCU_row (j_compress_ptr cinfo)
   if (cinfo->comps_in_scan > 1) {
     coef->MCU_rows_per_iMCU_row = 1;
   } else {
-    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
     else
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -97,9 +95,9 @@ start_iMCU_row (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   coef->iMCU_row_num = 0;
   start_iMCU_row(cinfo);
@@ -140,9 +138,9 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(boolean)
-compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_data(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -167,31 +165,33 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       blkn = 0;
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
         compptr = cinfo->cur_comp_info[ci];
-        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                : compptr->last_col_width;
+        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+                                                  compptr->last_col_width;
         xpos = MCU_col_num * compptr->MCU_sample_width;
         ypos = yoffset * DCTSIZE; /* ypos == (yoffset+yindex) * DCTSIZE */
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
           if (coef->iMCU_row_num < last_iMCU_row ||
-              yoffset+yindex < compptr->last_row_height) {
+              yoffset + yindex < compptr->last_row_height) {
             (*cinfo->fdct->forward_DCT) (cinfo, compptr,
                                          input_buf[compptr->component_index],
                                          coef->MCU_buffer[blkn],
-                                         ypos, xpos, (JDIMENSION) blockcnt);
+                                         ypos, xpos, (JDIMENSION)blockcnt);
             if (blockcnt < compptr->MCU_width) {
               /* Create some dummy blocks at the right edge of the image. */
-              jzero_far((void *) coef->MCU_buffer[blkn + blockcnt],
+              jzero_far((void *)coef->MCU_buffer[blkn + blockcnt],
                         (compptr->MCU_width - blockcnt) * sizeof(JBLOCK));
               for (bi = blockcnt; bi < compptr->MCU_width; bi++) {
-                coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn+bi-1][0][0];
+                coef->MCU_buffer[blkn + bi][0][0] =
+                  coef->MCU_buffer[blkn + bi - 1][0][0];
               }
             }
           } else {
             /* Create a row of dummy blocks at the bottom of the image. */
-            jzero_far((void *) coef->MCU_buffer[blkn],
+            jzero_far((void *)coef->MCU_buffer[blkn],
                       compptr->MCU_width * sizeof(JBLOCK));
             for (bi = 0; bi < compptr->MCU_width; bi++) {
-              coef->MCU_buffer[blkn+bi][0][0] = coef->MCU_buffer[blkn-1][0][0];
+              coef->MCU_buffer[blkn + bi][0][0] =
+                coef->MCU_buffer[blkn - 1][0][0];
             }
           }
           blkn += compptr->MCU_width;
@@ -201,7 +201,7 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       /* Try to write the MCU.  In event of a suspension failure, we will
        * re-DCT the MCU on restart (a bit inefficient, could be fixed...)
        */
-      if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+      if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->mcu_ctr = MCU_col_num;
@@ -242,9 +242,9 @@ compress_data (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  */
 
 METHODDEF(boolean)
-compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_first_pass(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   JDIMENSION blocks_across, MCUs_across, MCUindex;
   int bi, ci, h_samp_factor, block_row, block_rows, ndummy;
@@ -257,21 +257,21 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
        ci++, compptr++) {
     /* Align the virtual buffer for this component. */
     buffer = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[ci],
+      ((j_common_ptr)cinfo, coef->whole_image[ci],
        coef->iMCU_row_num * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, TRUE);
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
     /* Count non-dummy DCT block rows in this iMCU row. */
     if (coef->iMCU_row_num < last_iMCU_row)
       block_rows = compptr->v_samp_factor;
     else {
       /* NB: can't use last_row_height here, since may not be set! */
-      block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+      block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
     }
     blocks_across = compptr->width_in_blocks;
     h_samp_factor = compptr->h_samp_factor;
     /* Count number of dummy blocks to be added at the right margin. */
-    ndummy = (int) (blocks_across % h_samp_factor);
+    ndummy = (int)(blocks_across % h_samp_factor);
     if (ndummy > 0)
       ndummy = h_samp_factor - ndummy;
     /* Perform DCT for all non-dummy blocks in this iMCU row.  Each call
@@ -281,12 +281,12 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       thisblockrow = buffer[block_row];
       (*cinfo->fdct->forward_DCT) (cinfo, compptr,
                                    input_buf[ci], thisblockrow,
-                                   (JDIMENSION) (block_row * DCTSIZE),
-                                   (JDIMENSION) 0, blocks_across);
+                                   (JDIMENSION)(block_row * DCTSIZE),
+                                   (JDIMENSION)0, blocks_across);
       if (ndummy > 0) {
         /* Create dummy blocks at the right edge of the image. */
         thisblockrow += blocks_across; /* => first dummy block */
-        jzero_far((void *) thisblockrow, ndummy * sizeof(JBLOCK));
+        jzero_far((void *)thisblockrow, ndummy * sizeof(JBLOCK));
         lastDC = thisblockrow[-1][0];
         for (bi = 0; bi < ndummy; bi++) {
           thisblockrow[bi][0] = lastDC;
@@ -304,11 +304,11 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       for (block_row = block_rows; block_row < compptr->v_samp_factor;
            block_row++) {
         thisblockrow = buffer[block_row];
-        lastblockrow = buffer[block_row-1];
-        jzero_far((void *) thisblockrow,
-                  (size_t) (blocks_across * sizeof(JBLOCK)));
+        lastblockrow = buffer[block_row - 1];
+        jzero_far((void *)thisblockrow,
+                  (size_t)(blocks_across * sizeof(JBLOCK)));
         for (MCUindex = 0; MCUindex < MCUs_across; MCUindex++) {
-          lastDC = lastblockrow[h_samp_factor-1][0];
+          lastDC = lastblockrow[h_samp_factor - 1][0];
           for (bi = 0; bi < h_samp_factor; bi++) {
             thisblockrow[bi][0] = lastDC;
           }
@@ -338,9 +338,9 @@ compress_first_pass (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  */
 
 METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   int blkn, ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
@@ -355,9 +355,9 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     buffer[ci] = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+      ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
        coef->iMCU_row_num * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, FALSE);
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
   }
 
   /* Loop to process one whole iMCU row */
@@ -371,14 +371,14 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
         compptr = cinfo->cur_comp_info[ci];
         start_col = MCU_col_num * compptr->MCU_width;
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-          buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+          buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
           for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
             coef->MCU_buffer[blkn++] = buffer_ptr++;
           }
         }
       }
       /* Try to write the MCU. */
-      if (! (*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
+      if (!(*cinfo->entropy->encode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->mcu_ctr = MCU_col_num;
@@ -402,14 +402,14 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  */
 
 GLOBAL(void)
-jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_coef_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
   coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
-  cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+  cinfo->coef = (struct jpeg_c_coef_controller *)coef;
   coef->pub.start_pass = start_pass_coef;
 
   /* Create the coefficient buffer. */
@@ -423,12 +423,12 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
-         (JDIMENSION) jround_up((long) compptr->width_in_blocks,
-                                (long) compptr->h_samp_factor),
-         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-                                (long) compptr->v_samp_factor),
-         (JDIMENSION) compptr->v_samp_factor);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+         (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                               (long)compptr->h_samp_factor),
+         (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+                               (long)compptr->v_samp_factor),
+         (JDIMENSION)compptr->v_samp_factor);
     }
 #else
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -439,7 +439,7 @@ jinit_c_coef_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     int i;
 
     buffer = (JBLOCKROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
     for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
diff --git a/media/libjpeg/jccolext.c b/media/libjpeg/jccolext.c
index 479b320446..303b322ce6 100644
--- a/media/libjpeg/jccolext.c
+++ b/media/libjpeg/jccolext.c
@@ -29,13 +29,13 @@
 
 INLINE
 LOCAL(void)
-rgb_ycc_convert_internal (j_compress_ptr cinfo,
-                          JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                          JDIMENSION output_row, int num_rows)
+rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+                         int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
-  register JLONG * ctab = cconvert->rgb_ycc_tab;
+  register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2;
   register JDIMENSION col;
@@ -48,9 +48,9 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -58,17 +58,14 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
        * need the general RIGHT_SHIFT macro.
        */
       /* Y */
-      outptr0[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                ctab[b + B_Y_OFF]) >> SCALEBITS);
       /* Cb */
-      outptr1[col] = (JSAMPLE)
-                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
-                 >> SCALEBITS);
+      outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+                                ctab[b + B_CB_OFF]) >> SCALEBITS);
       /* Cr */
-      outptr2[col] = (JSAMPLE)
-                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
-                 >> SCALEBITS);
+      outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+                                ctab[b + B_CR_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -86,13 +83,13 @@ rgb_ycc_convert_internal (j_compress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-rgb_gray_convert_internal (j_compress_ptr cinfo,
-                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                           JDIMENSION output_row, int num_rows)
+rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                          JSAMPIMAGE output_buf, JDIMENSION output_row,
+                          int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
-  register JLONG * ctab = cconvert->rgb_ycc_tab;
+  register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
   register JSAMPROW outptr;
   register JDIMENSION col;
@@ -103,14 +100,13 @@ rgb_gray_convert_internal (j_compress_ptr cinfo,
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* Y */
-      outptr[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                               ctab[b + B_Y_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -123,9 +119,9 @@ rgb_gray_convert_internal (j_compress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-rgb_rgb_convert_internal (j_compress_ptr cinfo,
-                          JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                          JDIMENSION output_row, int num_rows)
+rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JSAMPIMAGE output_buf, JDIMENSION output_row,
+                         int num_rows)
 {
   register JSAMPROW inptr;
   register JSAMPROW outptr0, outptr1, outptr2;
@@ -139,9 +135,9 @@ rgb_rgb_convert_internal (j_compress_ptr cinfo,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
-      outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
-      outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+      outptr0[col] = inptr[RGB_RED];
+      outptr1[col] = inptr[RGB_GREEN];
+      outptr2[col] = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
     }
   }
diff --git a/media/libjpeg/jccolor.c b/media/libjpeg/jccolor.c
index b973d101d6..bdc563c723 100644
--- a/media/libjpeg/jccolor.c
+++ b/media/libjpeg/jccolor.c
@@ -63,9 +63,9 @@ typedef my_color_converter *my_cconvert_ptr;
  */
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define CBCR_OFFSET     ((JLONG) CENTERJSAMPLE << SCALEBITS)
-#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define CBCR_OFFSET     ((JLONG)CENTERJSAMPLE << SCALEBITS)
+#define ONE_HALF        ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x)          ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
 
 /* We allocate one big table and divide it up into eight parts, instead of
  * doing eight alloc_small requests.  This lets us use a single table base
@@ -74,15 +74,15 @@ typedef my_color_converter *my_cconvert_ptr;
  */
 
 #define R_Y_OFF         0                       /* offset to R => Y section */
-#define G_Y_OFF         (1*(MAXJSAMPLE+1))      /* offset to G => Y section */
-#define B_Y_OFF         (2*(MAXJSAMPLE+1))      /* etc. */
-#define R_CB_OFF        (3*(MAXJSAMPLE+1))
-#define G_CB_OFF        (4*(MAXJSAMPLE+1))
-#define B_CB_OFF        (5*(MAXJSAMPLE+1))
+#define G_Y_OFF         (1 * (MAXJSAMPLE + 1))  /* offset to G => Y section */
+#define B_Y_OFF         (2 * (MAXJSAMPLE + 1))  /* etc. */
+#define R_CB_OFF        (3 * (MAXJSAMPLE + 1))
+#define G_CB_OFF        (4 * (MAXJSAMPLE + 1))
+#define B_CB_OFF        (5 * (MAXJSAMPLE + 1))
 #define R_CR_OFF        B_CB_OFF                /* B=>Cb, R=>Cr are the same */
-#define G_CR_OFF        (6*(MAXJSAMPLE+1))
-#define B_CR_OFF        (7*(MAXJSAMPLE+1))
-#define TABLE_SIZE      (8*(MAXJSAMPLE+1))
+#define G_CR_OFF        (6 * (MAXJSAMPLE + 1))
+#define B_CR_OFF        (7 * (MAXJSAMPLE + 1))
+#define TABLE_SIZE      (8 * (MAXJSAMPLE + 1))
 
 
 /* Include inline routines for colorspace extensions */
@@ -93,13 +93,13 @@ typedef my_color_converter *my_cconvert_ptr;
 #undef RGB_BLUE
 #undef RGB_PIXELSIZE
 
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define rgb_ycc_convert_internal extrgb_ycc_convert_internal
-#define rgb_gray_convert_internal extrgb_gray_convert_internal
-#define rgb_rgb_convert_internal extrgb_rgb_convert_internal
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define rgb_ycc_convert_internal  extrgb_ycc_convert_internal
+#define rgb_gray_convert_internal  extrgb_gray_convert_internal
+#define rgb_rgb_convert_internal  extrgb_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -109,13 +109,13 @@ typedef my_color_converter *my_cconvert_ptr;
 #undef rgb_gray_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define rgb_ycc_convert_internal extrgbx_ycc_convert_internal
-#define rgb_gray_convert_internal extrgbx_gray_convert_internal
-#define rgb_rgb_convert_internal extrgbx_rgb_convert_internal
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define rgb_ycc_convert_internal  extrgbx_ycc_convert_internal
+#define rgb_gray_convert_internal  extrgbx_gray_convert_internal
+#define rgb_rgb_convert_internal  extrgbx_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -125,13 +125,13 @@ typedef my_color_converter *my_cconvert_ptr;
 #undef rgb_gray_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define rgb_ycc_convert_internal extbgr_ycc_convert_internal
-#define rgb_gray_convert_internal extbgr_gray_convert_internal
-#define rgb_rgb_convert_internal extbgr_rgb_convert_internal
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define rgb_ycc_convert_internal  extbgr_ycc_convert_internal
+#define rgb_gray_convert_internal  extbgr_gray_convert_internal
+#define rgb_rgb_convert_internal  extbgr_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -141,13 +141,13 @@ typedef my_color_converter *my_cconvert_ptr;
 #undef rgb_gray_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define rgb_ycc_convert_internal extbgrx_ycc_convert_internal
-#define rgb_gray_convert_internal extbgrx_gray_convert_internal
-#define rgb_rgb_convert_internal extbgrx_rgb_convert_internal
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define rgb_ycc_convert_internal  extbgrx_ycc_convert_internal
+#define rgb_gray_convert_internal  extbgrx_gray_convert_internal
+#define rgb_rgb_convert_internal  extbgrx_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -157,13 +157,13 @@ typedef my_color_converter *my_cconvert_ptr;
 #undef rgb_gray_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define rgb_ycc_convert_internal extxbgr_ycc_convert_internal
-#define rgb_gray_convert_internal extxbgr_gray_convert_internal
-#define rgb_rgb_convert_internal extxbgr_rgb_convert_internal
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define rgb_ycc_convert_internal  extxbgr_ycc_convert_internal
+#define rgb_gray_convert_internal  extxbgr_gray_convert_internal
+#define rgb_rgb_convert_internal  extxbgr_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -173,13 +173,13 @@ typedef my_color_converter *my_cconvert_ptr;
 #undef rgb_gray_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define rgb_ycc_convert_internal extxrgb_ycc_convert_internal
-#define rgb_gray_convert_internal extxrgb_gray_convert_internal
-#define rgb_rgb_convert_internal extxrgb_rgb_convert_internal
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define rgb_ycc_convert_internal  extxrgb_ycc_convert_internal
+#define rgb_gray_convert_internal  extxrgb_gray_convert_internal
+#define rgb_rgb_convert_internal  extxrgb_rgb_convert_internal
 #include "jccolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -195,33 +195,33 @@ typedef my_color_converter *my_cconvert_ptr;
  */
 
 METHODDEF(void)
-rgb_ycc_start (j_compress_ptr cinfo)
+rgb_ycc_start(j_compress_ptr cinfo)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   JLONG *rgb_ycc_tab;
   JLONG i;
 
   /* Allocate and fill in the conversion tables. */
   cconvert->rgb_ycc_tab = rgb_ycc_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
-    rgb_ycc_tab[i+R_Y_OFF] = FIX(0.29900) * i;
-    rgb_ycc_tab[i+G_Y_OFF] = FIX(0.58700) * i;
-    rgb_ycc_tab[i+B_Y_OFF] = FIX(0.11400) * i     + ONE_HALF;
-    rgb_ycc_tab[i+R_CB_OFF] = (-FIX(0.16874)) * i;
-    rgb_ycc_tab[i+G_CB_OFF] = (-FIX(0.33126)) * i;
+    rgb_ycc_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+    rgb_ycc_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+    rgb_ycc_tab[i + B_Y_OFF] = FIX(0.11400) * i   + ONE_HALF;
+    rgb_ycc_tab[i + R_CB_OFF] = (-FIX(0.16874)) * i;
+    rgb_ycc_tab[i + G_CB_OFF] = (-FIX(0.33126)) * i;
     /* We use a rounding fudge-factor of 0.5-epsilon for Cb and Cr.
      * This ensures that the maximum output will round to MAXJSAMPLE
      * not MAXJSAMPLE+1, and thus that we don't have to range-limit.
      */
-    rgb_ycc_tab[i+B_CB_OFF] = FIX(0.50000) * i    + CBCR_OFFSET + ONE_HALF-1;
+    rgb_ycc_tab[i + B_CB_OFF] = FIX(0.50000) * i  + CBCR_OFFSET + ONE_HALF - 1;
 /*  B=>Cb and R=>Cr tables are the same
-    rgb_ycc_tab[i+R_CR_OFF] = FIX(0.50000) * i    + CBCR_OFFSET + ONE_HALF-1;
+    rgb_ycc_tab[i + R_CR_OFF] = FIX(0.50000) * i  + CBCR_OFFSET + ONE_HALF - 1;
 */
-    rgb_ycc_tab[i+G_CR_OFF] = (-FIX(0.41869)) * i;
-    rgb_ycc_tab[i+B_CR_OFF] = (-FIX(0.08131)) * i;
+    rgb_ycc_tab[i + G_CR_OFF] = (-FIX(0.41869)) * i;
+    rgb_ycc_tab[i + B_CR_OFF] = (-FIX(0.08131)) * i;
   }
 }
 
@@ -231,43 +231,42 @@ rgb_ycc_start (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-rgb_ycc_convert (j_compress_ptr cinfo,
-                 JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                 JDIMENSION output_row, int num_rows)
+rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    default:
-      rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  default:
+    rgb_ycc_convert_internal(cinfo, input_buf, output_buf, output_row,
+                             num_rows);
+    break;
   }
 }
 
@@ -280,43 +279,42 @@ rgb_ycc_convert (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-rgb_gray_convert (j_compress_ptr cinfo,
-                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                  JDIMENSION output_row, int num_rows)
+rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                 JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                    num_rows);
-      break;
-    default:
-      rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                num_rows);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                  num_rows);
+    break;
+  default:
+    rgb_gray_convert_internal(cinfo, input_buf, output_buf, output_row,
+                              num_rows);
+    break;
   }
 }
 
@@ -326,43 +324,42 @@ rgb_gray_convert (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-rgb_rgb_convert (j_compress_ptr cinfo,
-                  JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                  JDIMENSION output_row, int num_rows)
+rgb_rgb_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   switch (cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                                   num_rows);
-      break;
-    default:
-      rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                                 num_rows);
+    break;
+  default:
+    rgb_rgb_convert_internal(cinfo, input_buf, output_buf, output_row,
+                             num_rows);
+    break;
   }
 }
 
@@ -376,11 +373,10 @@ rgb_rgb_convert (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-cmyk_ycck_convert (j_compress_ptr cinfo,
-                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                   JDIMENSION output_row, int num_rows)
+cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_ycc_tab;
   register JSAMPROW inptr;
@@ -396,11 +392,11 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
     outptr3 = output_buf[3][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
-      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
-      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
+      r = MAXJSAMPLE - inptr[0];
+      g = MAXJSAMPLE - inptr[1];
+      b = MAXJSAMPLE - inptr[2];
       /* K passes through as-is */
-      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
+      outptr3[col] = inptr[3];
       inptr += 4;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -408,17 +404,14 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
        * need the general RIGHT_SHIFT macro.
        */
       /* Y */
-      outptr0[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr0[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                                ctab[b + B_Y_OFF]) >> SCALEBITS);
       /* Cb */
-      outptr1[col] = (JSAMPLE)
-                ((ctab[r+R_CB_OFF] + ctab[g+G_CB_OFF] + ctab[b+B_CB_OFF])
-                 >> SCALEBITS);
+      outptr1[col] = (JSAMPLE)((ctab[r + R_CB_OFF] + ctab[g + G_CB_OFF] +
+                                ctab[b + B_CB_OFF]) >> SCALEBITS);
       /* Cr */
-      outptr2[col] = (JSAMPLE)
-                ((ctab[r+R_CR_OFF] + ctab[g+G_CR_OFF] + ctab[b+B_CR_OFF])
-                 >> SCALEBITS);
+      outptr2[col] = (JSAMPLE)((ctab[r + R_CR_OFF] + ctab[g + G_CR_OFF] +
+                                ctab[b + B_CR_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -431,9 +424,8 @@ cmyk_ycck_convert (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-grayscale_convert (j_compress_ptr cinfo,
-                   JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                   JDIMENSION output_row, int num_rows)
+grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                  JSAMPIMAGE output_buf, JDIMENSION output_row, int num_rows)
 {
   register JSAMPROW inptr;
   register JSAMPROW outptr;
@@ -446,7 +438,7 @@ grayscale_convert (j_compress_ptr cinfo,
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr[col] = inptr[0];   /* don't need GETJSAMPLE() here */
+      outptr[col] = inptr[0];
       inptr += instride;
     }
   }
@@ -460,9 +452,8 @@ grayscale_convert (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-null_convert (j_compress_ptr cinfo,
-              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-              JDIMENSION output_row, int num_rows)
+null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows)
 {
   register JSAMPROW inptr;
   register JSAMPROW outptr, outptr0, outptr1, outptr2, outptr3;
@@ -506,7 +497,7 @@ null_convert (j_compress_ptr cinfo,
         inptr = *input_buf;
         outptr = output_buf[ci][output_row];
         for (col = 0; col < num_cols; col++) {
-          outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+          outptr[col] = inptr[ci];
           inptr += nc;
         }
       }
@@ -522,7 +513,7 @@ null_convert (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-null_method (j_compress_ptr cinfo)
+null_method(j_compress_ptr cinfo)
 {
   /* no work needed */
 }
@@ -533,14 +524,14 @@ null_method (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_color_converter (j_compress_ptr cinfo)
+jinit_color_converter(j_compress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
 
   cconvert = (my_cconvert_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_color_converter));
-  cinfo->cconvert = (struct jpeg_color_converter *) cconvert;
+  cinfo->cconvert = (struct jpeg_color_converter *)cconvert;
   /* set start_pass to null method until we find out differently */
   cconvert->pub.start_pass = null_method;
 
diff --git a/media/libjpeg/jcdctmgr.c b/media/libjpeg/jcdctmgr.c
index aef8517f9c..7dae17a6e1 100644
--- a/media/libjpeg/jcdctmgr.c
+++ b/media/libjpeg/jcdctmgr.c
@@ -41,7 +41,7 @@ typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
                                            FAST_FLOAT *divisors,
                                            FAST_FLOAT *workspace);
 
-METHODDEF(void) quantize (JCOEFPTR, DCTELEM *, DCTELEM *);
+METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
 
 typedef struct {
   struct jpeg_forward_dct pub;  /* public fields */
@@ -80,7 +80,7 @@ typedef my_fdct_controller *my_fdct_ptr;
  */
 
 LOCAL(int)
-flss (UINT16 val)
+flss(UINT16 val)
 {
   int bit;
 
@@ -170,7 +170,7 @@ flss (UINT16 val)
  */
 
 LOCAL(int)
-compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
+compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
 {
   UDCTELEM2 fq, fr;
   UDCTELEM c;
@@ -182,10 +182,10 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
      * identity function.  Since only the C quantization algorithm is used in
      * these cases, the scale value is irrelevant.
      */
-    dtbl[DCTSIZE2 * 0] = (DCTELEM) 1;                       /* reciprocal */
-    dtbl[DCTSIZE2 * 1] = (DCTELEM) 0;                       /* correction */
-    dtbl[DCTSIZE2 * 2] = (DCTELEM) 1;                       /* scale */
-    dtbl[DCTSIZE2 * 3] = -(DCTELEM) (sizeof(DCTELEM) * 8);  /* shift */
+    dtbl[DCTSIZE2 * 0] = (DCTELEM)1;                        /* reciprocal */
+    dtbl[DCTSIZE2 * 1] = (DCTELEM)0;                        /* correction */
+    dtbl[DCTSIZE2 * 2] = (DCTELEM)1;                        /* scale */
+    dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8);   /* shift */
     return 0;
   }
 
@@ -195,28 +195,28 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
   fq = ((UDCTELEM2)1 << r) / divisor;
   fr = ((UDCTELEM2)1 << r) % divisor;
 
-  c = divisor / 2; /* for rounding */
+  c = divisor / 2;                      /* for rounding */
 
-  if (fr == 0) { /* divisor is power of two */
+  if (fr == 0) {                        /* divisor is power of two */
     /* fq will be one bit too large to fit in DCTELEM, so adjust */
     fq >>= 1;
     r--;
-  } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */
+  } else if (fr <= (divisor / 2U)) {    /* fractional part is < 0.5 */
     c++;
-  } else { /* fractional part is > 0.5 */
+  } else {                              /* fractional part is > 0.5 */
     fq++;
   }
 
-  dtbl[DCTSIZE2 * 0] = (DCTELEM) fq;      /* reciprocal */
-  dtbl[DCTSIZE2 * 1] = (DCTELEM) c;       /* correction + roundfactor */
+  dtbl[DCTSIZE2 * 0] = (DCTELEM)fq;     /* reciprocal */
+  dtbl[DCTSIZE2 * 1] = (DCTELEM)c;      /* correction + roundfactor */
 #ifdef WITH_SIMD
-  dtbl[DCTSIZE2 * 2] = (DCTELEM) (1 << (sizeof(DCTELEM)*8*2 - r));  /* scale */
+  dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
 #else
   dtbl[DCTSIZE2 * 2] = 1;
 #endif
-  dtbl[DCTSIZE2 * 3] = (DCTELEM) r - sizeof(DCTELEM)*8; /* shift */
+  dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
 
-  if(r <= 16) return 0;
+  if (r <= 16) return 0;
   else return 1;
 }
 
@@ -233,9 +233,9 @@ compute_reciprocal (UINT16 divisor, DCTELEM *dtbl)
  */
 
 METHODDEF(void)
-start_pass_fdctmgr (j_compress_ptr cinfo)
+start_pass_fdctmgr(j_compress_ptr cinfo)
 {
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
   int ci, qtblno, i;
   jpeg_component_info *compptr;
   JQUANT_TBL *qtbl;
@@ -259,7 +259,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
        */
       if (fdct->divisors[qtblno] == NULL) {
         fdct->divisors[qtblno] = (DCTELEM *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       (DCTSIZE2 * 4) * sizeof(DCTELEM));
       }
       dtbl = fdct->divisors[qtblno];
@@ -269,7 +269,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
             fdct->quantize == jsimd_quantize)
           fdct->quantize = quantize;
 #else
-        dtbl[i] = ((DCTELEM) qtbl->quantval[i]) << 3;
+        dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
 #endif
       }
       break;
@@ -283,7 +283,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
          *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
          * We apply a further scale factor of 8.
          */
-#define CONST_BITS 14
+#define CONST_BITS  14
         static const INT16 aanscales[DCTSIZE2] = {
           /* precomputed values scaled up by 14 bits */
           16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
@@ -299,23 +299,23 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 
         if (fdct->divisors[qtblno] == NULL) {
           fdct->divisors[qtblno] = (DCTELEM *)
-            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+            (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                         (DCTSIZE2 * 4) * sizeof(DCTELEM));
         }
         dtbl = fdct->divisors[qtblno];
         for (i = 0; i < DCTSIZE2; i++) {
 #if BITS_IN_JSAMPLE == 8
           if (!compute_reciprocal(
-                DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
-                                      (JLONG) aanscales[i]),
-                        CONST_BITS-3), &dtbl[i]) &&
+                DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+                                      (JLONG)aanscales[i]),
+                        CONST_BITS - 3), &dtbl[i]) &&
               fdct->quantize == jsimd_quantize)
             fdct->quantize = quantize;
 #else
-           dtbl[i] = (DCTELEM)
-             DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
-                                   (JLONG) aanscales[i]),
-                     CONST_BITS-3);
+          dtbl[i] = (DCTELEM)
+            DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+                                  (JLONG)aanscales[i]),
+                    CONST_BITS - 3);
 #endif
         }
       }
@@ -341,7 +341,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
 
         if (fdct->float_divisors[qtblno] == NULL) {
           fdct->float_divisors[qtblno] = (FAST_FLOAT *)
-            (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+            (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                         DCTSIZE2 * sizeof(FAST_FLOAT));
         }
         fdtbl = fdct->float_divisors[qtblno];
@@ -349,7 +349,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
         for (row = 0; row < DCTSIZE; row++) {
           for (col = 0; col < DCTSIZE; col++) {
             fdtbl[i] = (FAST_FLOAT)
-              (1.0 / (((double) qtbl->quantval[i] *
+              (1.0 / (((double)qtbl->quantval[i] *
                        aanscalefactor[row] * aanscalefactor[col] * 8.0)));
             i++;
           }
@@ -370,7 +370,7 @@ start_pass_fdctmgr (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
+convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
 {
   register DCTELEM *workspaceptr;
   register JSAMPROW elemptr;
@@ -381,19 +381,19 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
     elemptr = sample_data[elemr] + start_col;
 
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+        *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
     }
 #endif
   }
@@ -405,7 +405,7 @@ convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
  */
 
 METHODDEF(void)
-quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
 {
   int i;
   DCTELEM temp;
@@ -426,15 +426,15 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
     if (temp < 0) {
       temp = -temp;
       product = (UDCTELEM2)(temp + corr) * recip;
-      product >>= shift + sizeof(DCTELEM)*8;
+      product >>= shift + sizeof(DCTELEM) * 8;
       temp = (DCTELEM)product;
       temp = -temp;
     } else {
       product = (UDCTELEM2)(temp + corr) * recip;
-      product >>= shift + sizeof(DCTELEM)*8;
+      product >>= shift + sizeof(DCTELEM) * 8;
       temp = (DCTELEM)product;
     }
-    output_ptr[i] = (JCOEF) temp;
+    output_ptr[i] = (JCOEF)temp;
   }
 
 #else
@@ -457,20 +457,20 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
      * If your machine's division is fast enough, define FAST_DIVIDE.
      */
 #ifdef FAST_DIVIDE
-#define DIVIDE_BY(a,b)  a /= b
+#define DIVIDE_BY(a, b)  a /= b
 #else
-#define DIVIDE_BY(a,b)  if (a >= b) a /= b; else a = 0
+#define DIVIDE_BY(a, b)  if (a >= b) a /= b;  else a = 0
 #endif
     if (temp < 0) {
       temp = -temp;
-      temp += qval>>1;  /* for rounding */
+      temp += qval >> 1;        /* for rounding */
       DIVIDE_BY(temp, qval);
       temp = -temp;
     } else {
-      temp += qval>>1;  /* for rounding */
+      temp += qval >> 1;        /* for rounding */
       DIVIDE_BY(temp, qval);
     }
-    output_ptr[i] = (JCOEF) temp;
+    output_ptr[i] = (JCOEF)temp;
   }
 
 #endif
@@ -487,14 +487,13 @@ quantize (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
  */
 
 METHODDEF(void)
-forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
-             JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-             JDIMENSION start_row, JDIMENSION start_col,
-             JDIMENSION num_blocks)
+forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
+            JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+            JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
 /* This version is used for integer DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
   DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
   DCTELEM *workspace;
   JDIMENSION bi;
@@ -522,9 +521,9 @@ forward_DCT (j_compress_ptr cinfo, jpeg_component_info *compptr,
 
 #ifdef DCT_FLOAT_SUPPORTED
 
-
 METHODDEF(void)
-convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace)
+convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+               FAST_FLOAT *workspace)
 {
   register FAST_FLOAT *workspaceptr;
   register JSAMPROW elemptr;
@@ -534,20 +533,19 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *worksp
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = (FAST_FLOAT)
-                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
     }
 #endif
   }
@@ -555,7 +553,8 @@ convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *worksp
 
 
 METHODDEF(void)
-quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace)
+quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+               FAST_FLOAT *workspace)
 {
   register FAST_FLOAT temp;
   register int i;
@@ -571,20 +570,20 @@ quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace
      * The maximum coefficient size is +-16K (for 12-bit data), so this
      * code should work for either 16-bit or 32-bit ints.
      */
-    output_ptr[i] = (JCOEF) ((int) (temp + (FAST_FLOAT) 16384.5) - 16384);
+    output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384);
   }
 }
 
 
 METHODDEF(void)
-forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                   JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
-                   JDIMENSION start_row, JDIMENSION start_col,
-                   JDIMENSION num_blocks)
+forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                  JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
+                  JDIMENSION start_row, JDIMENSION start_col,
+                  JDIMENSION num_blocks)
 /* This version is used for floating-point DCT implementations. */
 {
   /* This routine is heavily used, so it's worth coding it tightly. */
-  my_fdct_ptr fdct = (my_fdct_ptr) cinfo->fdct;
+  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
   FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
   FAST_FLOAT *workspace;
   JDIMENSION bi;
@@ -618,15 +617,15 @@ forward_DCT_float (j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jinit_forward_dct (j_compress_ptr cinfo)
+jinit_forward_dct(j_compress_ptr cinfo)
 {
   my_fdct_ptr fdct;
   int i;
 
   fdct = (my_fdct_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_fdct_controller));
-  cinfo->fdct = (struct jpeg_forward_dct *) fdct;
+  cinfo->fdct = (struct jpeg_forward_dct *)fdct;
   fdct->pub.start_pass = start_pass_fdctmgr;
 
   /* First determine the DCT... */
@@ -703,12 +702,12 @@ jinit_forward_dct (j_compress_ptr cinfo)
 #ifdef DCT_FLOAT_SUPPORTED
   if (cinfo->dct_method == JDCT_FLOAT)
     fdct->float_workspace = (FAST_FLOAT *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(FAST_FLOAT) * DCTSIZE2);
   else
 #endif
     fdct->workspace = (DCTELEM *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(DCTELEM) * DCTSIZE2);
 
   /* Mark divisor tables unallocated */
diff --git a/media/libjpeg/jchuff.c b/media/libjpeg/jchuff.c
index fffaacebce..f4dfa1cb54 100644
--- a/media/libjpeg/jchuff.c
+++ b/media/libjpeg/jchuff.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -16,6 +18,9 @@
  * back up to the start of the current MCU.  To do this, we copy state
  * variables into local working storage, and update them back to the
  * permanent JPEG objects only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
  */
 
 #define JPEG_INTERNALS
@@ -31,32 +36,33 @@
  * memory footprint by 64k, which is important for some mobile applications
  * that create many isolated instances of libjpeg-turbo (web browsers, for
  * instance.)  This may improve performance on some mobile platforms as well.
- * This feature is enabled by default only on ARM processors, because some x86
+ * This feature is enabled by default only on Arm processors, because some x86
  * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
  * shown to have a significant performance impact even on the x86 chips that
- * have a fast implementation of it.  When building for ARMv6, you can
+ * have a fast implementation of it.  When building for Armv6, you can
  * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
  * flags (this defines __thumb__).
  */
 
 /* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
-#if !defined __thumb__ || defined __thumb2__
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+    defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
 
 #ifdef USE_CLZ_INTRINSIC
-#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
-#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
+#else
+#define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
+#endif
+#define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
 #else
 #include "jpeg_nbits_table.h"
-#define JPEG_NBITS(x) (jpeg_nbits_table[x])
-#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
-#endif
-
-#ifndef min
- #define min(a,b) ((a)<(b)?(a):(b))
+#define JPEG_NBITS(x)          (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x)  JPEG_NBITS(x)
 #endif
 
 
@@ -66,31 +72,42 @@
  * but must not be updated permanently until we complete the MCU.
  */
 
-typedef struct {
-  size_t put_buffer;            /* current bit-accumulation buffer */
-  int put_bits;                 /* # of bits now in it */
-  int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
-} savable_state;
-
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
+#if defined(__x86_64__) && defined(__ILP32__)
+typedef unsigned long long bit_buf_type;
 #else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-        ((dest).put_buffer = (src).put_buffer, \
-         (dest).put_bits = (src).put_bits, \
-         (dest).last_dc_val[0] = (src).last_dc_val[0], \
-         (dest).last_dc_val[1] = (src).last_dc_val[1], \
-         (dest).last_dc_val[2] = (src).last_dc_val[2], \
-         (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
+typedef size_t bit_buf_type;
 #endif
 
+/* NOTE: The more optimal Huffman encoding algorithm is only used by the
+ * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
+ * retain the old Huffman encoder behavior when using the GAS implementation.
+ */
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+                            defined(_M_ARM) || defined(_M_ARM64))
+typedef unsigned long long simd_bit_buf_type;
+#else
+typedef bit_buf_type simd_bit_buf_type;
+#endif
+
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+    (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE  64
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE  32
+#else
+#error Cannot determine word size
+#endif
+#define SIMD_BIT_BUF_SIZE  (sizeof(simd_bit_buf_type) * 8)
+
+typedef struct {
+  union {
+    bit_buf_type c;
+    simd_bit_buf_type simd;
+  } put_buffer;                         /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+                                        /* (Neon GAS: # of bits now in it) */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
 
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
@@ -124,16 +141,17 @@ typedef struct {
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
   savable_state cur;            /* Current bit buffer & DC state */
   j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
 } working_state;
 
 
 /* Forward declarations */
-METHODDEF(boolean) encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_huff (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_huff(j_compress_ptr cinfo);
 #ifdef ENTROPY_OPT_SUPPORTED
-METHODDEF(boolean) encode_mcu_gather (j_compress_ptr cinfo,
-                                      JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
+METHODDEF(boolean) encode_mcu_gather(j_compress_ptr cinfo,
+                                     JBLOCKROW *MCU_data);
+METHODDEF(void) finish_pass_gather(j_compress_ptr cinfo);
 #endif
 
 
@@ -144,9 +162,9 @@ METHODDEF(void) finish_pass_gather (j_compress_ptr cinfo);
  */
 
 METHODDEF(void)
-start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci, dctbl, actbl;
   jpeg_component_info *compptr;
 
@@ -180,30 +198,39 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
       /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
       if (entropy->dc_count_ptrs[dctbl] == NULL)
         entropy->dc_count_ptrs[dctbl] = (long *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
-      MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
+      memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long));
       if (entropy->ac_count_ptrs[actbl] == NULL)
         entropy->ac_count_ptrs[actbl] = (long *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
-      MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
+      memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long));
 #endif
     } else {
       /* Compute derived values for Huffman tables */
       /* We may do this more than once for a table, but it's not expensive */
       jpeg_make_c_derived_tbl(cinfo, TRUE, dctbl,
-                              & entropy->dc_derived_tbls[dctbl]);
+                              &entropy->dc_derived_tbls[dctbl]);
       jpeg_make_c_derived_tbl(cinfo, FALSE, actbl,
-                              & entropy->ac_derived_tbls[actbl]);
+                              &entropy->ac_derived_tbls[actbl]);
     }
     /* Initialize DC predictions to 0 */
     entropy->saved.last_dc_val[ci] = 0;
   }
 
   /* Initialize bit buffer to empty */
-  entropy->saved.put_buffer = 0;
-  entropy->saved.put_bits = 0;
+  if (entropy->simd) {
+    entropy->saved.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    entropy->saved.free_bits = 0;
+#else
+    entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    entropy->saved.put_buffer.c = 0;
+    entropy->saved.free_bits = BIT_BUF_SIZE;
+  }
 
   /* Initialize restart stuff */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -219,8 +246,8 @@ start_pass_huff (j_compress_ptr cinfo, boolean gather_statistics)
  */
 
 GLOBAL(void)
-jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
-                         c_derived_tbl **pdtbl)
+jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
+                        c_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   c_derived_tbl *dtbl;
@@ -244,7 +271,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
   /* Allocate a workspace if we haven't already done so. */
   if (*pdtbl == NULL)
     *pdtbl = (c_derived_tbl *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(c_derived_tbl));
   dtbl = *pdtbl;
 
@@ -252,11 +279,11 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
 
   p = 0;
   for (l = 1; l <= 16; l++) {
-    i = (int) htbl->bits[l];
+    i = (int)htbl->bits[l];
     if (i < 0 || p + i > 256)   /* protect against table overrun */
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     while (i--)
-      huffsize[p++] = (char) l;
+      huffsize[p++] = (char)l;
   }
   huffsize[p] = 0;
   lastp = p;
@@ -268,14 +295,14 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
   si = huffsize[0];
   p = 0;
   while (huffsize[p]) {
-    while (((int) huffsize[p]) == si) {
+    while (((int)huffsize[p]) == si) {
       huffcode[p++] = code;
       code++;
     }
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((JLONG) code) >= (((JLONG) 1) << si))
+    if (((JLONG)code) >= (((JLONG)1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
@@ -288,7 +315,8 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
    * this lets us detect duplicate VAL entries here, and later
    * allows emit_bits to detect any attempt to emit such symbols.
    */
-  MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
+  memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));
+  memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));
 
   /* This is also a convenient place to check for out-of-range
    * and duplicated VAL entries.  We allow 0..255 for AC symbols
@@ -310,20 +338,21 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
 /* Outputting bytes to the file */
 
 /* Emit a byte, taking 'action' if must suspend. */
-#define emit_byte(state,val,action)  \
-        { *(state)->next_output_byte++ = (JOCTET) (val);  \
-          if (--(state)->free_in_buffer == 0)  \
-            if (! dump_buffer(state))  \
-              { action; } }
+#define emit_byte(state, val, action) { \
+  *(state)->next_output_byte++ = (JOCTET)(val); \
+  if (--(state)->free_in_buffer == 0) \
+    if (!dump_buffer(state)) \
+      { action; } \
+}
 
 
 LOCAL(boolean)
-dump_buffer (working_state *state)
+dump_buffer(working_state *state)
 /* Empty the output buffer; return TRUE if successful, FALSE if must suspend */
 {
   struct jpeg_destination_mgr *dest = state->cinfo->dest;
 
-  if (! (*dest->empty_output_buffer) (state->cinfo))
+  if (!(*dest->empty_output_buffer) (state->cinfo))
     return FALSE;
   /* After a successful buffer dump, must reset buffer pointers */
   state->next_output_byte = dest->next_output_byte;
@@ -334,90 +363,94 @@ dump_buffer (working_state *state)
 
 /* Outputting bits to the file */
 
-/* These macros perform the same task as the emit_bits() function in the
- * original libjpeg code.  In addition to reducing overhead by explicitly
- * inlining the code, additional performance is achieved by taking into
- * account the size of the bit buffer and waiting until it is almost full
- * before emptying it.  This mostly benefits 64-bit platforms, since 6
- * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+/* Output byte b and, speculatively, an additional 0 byte.  0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
  */
-
-#define EMIT_BYTE() { \
-  JOCTET c; \
-  put_bits -= 8; \
-  c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
-  *buffer++ = c; \
-  if (c == 0xFF)  /* need to stuff a zero byte? */ \
-    *buffer++ = 0; \
- }
-
-#define PUT_BITS(code, size) { \
-  put_bits += size; \
-  put_buffer = (put_buffer << size) | code; \
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
 }
 
-#define CHECKBUF15() { \
-  if (put_bits > 15) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if BIT_BUF_SIZE == 64
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 56); \
+    buffer[1] = (JOCTET)(put_buffer >> 48); \
+    buffer[2] = (JOCTET)(put_buffer >> 40); \
+    buffer[3] = (JOCTET)(put_buffer >> 32); \
+    buffer[4] = (JOCTET)(put_buffer >> 24); \
+    buffer[5] = (JOCTET)(put_buffer >> 16); \
+    buffer[6] = (JOCTET)(put_buffer >> 8); \
+    buffer[7] = (JOCTET)(put_buffer); \
+    buffer += 8; \
   } \
 }
 
-#define CHECKBUF31() { \
-  if (put_bits > 31) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#define CHECKBUF47() { \
-  if (put_bits > 47) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
-
-#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
-#error Cannot determine word size
-#endif
-
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
-
-#define EMIT_BITS(code, size) { \
-  CHECKBUF47() \
-  PUT_BITS(code, size) \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG) 1)<<nbits) - 1; \
-  CHECKBUF31() \
-  PUT_BITS(code, size) \
-  PUT_BITS(temp2, nbits) \
- }
-
 #else
 
-#define EMIT_BITS(code, size) { \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 24); \
+    buffer[1] = (JOCTET)(put_buffer >> 16); \
+    buffer[2] = (JOCTET)(put_buffer >> 8); \
+    buffer[3] = (JOCTET)(put_buffer); \
+    buffer += 4; \
+  } \
 }
 
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG) 1)<<nbits) - 1; \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-  PUT_BITS(temp2, nbits) \
-  CHECKBUF15() \
- }
-
 #endif
 
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size) { \
+  temp &= (((JLONG)1) << nbits) - 1; \
+  temp |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(temp, nbits) \
+}
+
 
 /* Although it is exceedingly rare, it is possible for a Huffman-encoded
  * coefficient block to be larger than the 128-byte unencoded block.  For each
@@ -428,55 +461,81 @@ dump_buffer (working_state *state)
  * scanning order-- 1, 8, 16, etc.), then this will produce an encoded block
  * larger than 200 bytes.
  */
-#define BUFSIZE (DCTSIZE2 * 4)
+#define BUFSIZE  (DCTSIZE2 * 8)
 
 #define LOAD_BUFFER() { \
   if (state->free_in_buffer < BUFSIZE) { \
     localbuf = 1; \
     buffer = _buffer; \
-  } \
-  else buffer = state->next_output_byte; \
- }
+  } else \
+    buffer = state->next_output_byte; \
+}
 
 #define STORE_BUFFER() { \
   if (localbuf) { \
+    size_t bytes, bytestocopy; \
     bytes = buffer - _buffer; \
     buffer = _buffer; \
     while (bytes > 0) { \
-      bytestocopy = min(bytes, state->free_in_buffer); \
-      MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
+      bytestocopy = MIN(bytes, state->free_in_buffer); \
+      memcpy(state->next_output_byte, buffer, bytestocopy); \
       state->next_output_byte += bytestocopy; \
       buffer += bytestocopy; \
       state->free_in_buffer -= bytestocopy; \
       if (state->free_in_buffer == 0) \
-        if (! dump_buffer(state)) return FALSE; \
+        if (!dump_buffer(state)) return FALSE; \
       bytes -= bytestocopy; \
     } \
-  } \
-  else { \
+  } else { \
     state->free_in_buffer -= (buffer - state->next_output_byte); \
     state->next_output_byte = buffer; \
   } \
- }
+}
 
 
 LOCAL(boolean)
-flush_bits (working_state *state)
+flush_bits(working_state *state)
 {
-  JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  JOCTET _buffer[BUFSIZE], *buffer, temp;
+  simd_bit_buf_type put_buffer;  int put_bits;
+  int localbuf = 0;
+
+  if (state->simd) {
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    put_bits = state->cur.free_bits;
+#else
+    put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+    put_buffer = state->cur.put_buffer.simd;
+  } else {
+    put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+    put_buffer = state->cur.put_buffer.c;
+  }
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
   LOAD_BUFFER()
 
-  /* fill any partial byte with ones */
-  PUT_BITS(0x7F, 7)
-  while (put_bits >= 8) EMIT_BYTE()
+  while (put_bits >= 8) {
+    put_bits -= 8;
+    temp = (JOCTET)(put_buffer >> put_bits);
+    EMIT_BYTE(temp)
+  }
+  if (put_bits) {
+    /* fill partial byte with ones */
+    temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+    EMIT_BYTE(temp)
+  }
 
-  state->cur.put_buffer = 0;    /* and reset bit-buffer to empty */
-  state->cur.put_bits = 0;
+  if (state->simd) {                    /* and reset bit buffer to empty */
+    state->cur.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    state->cur.free_bits = 0;
+#else
+    state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    state->cur.put_buffer.c = 0;
+    state->cur.free_bits = BIT_BUF_SIZE;
+  }
   STORE_BUFFER()
 
   return TRUE;
@@ -486,11 +545,11 @@ flush_bits (working_state *state)
 /* Encode a single block's worth of coefficients */
 
 LOCAL(boolean)
-encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
-                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
+                      c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
   LOAD_BUFFER()
 
@@ -503,108 +562,91 @@ encode_one_block_simd (working_state *state, JCOEFPTR block, int last_dc_val,
 }
 
 LOCAL(boolean)
-encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
-                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
+                 c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
-  int temp, temp2, temp3;
-  int nbits;
-  int r, code, size;
+  int temp, nbits, free_bits;
+  bit_buf_type put_buffer;
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
+  free_bits = state->cur.free_bits;
+  put_buffer = state->cur.put_buffer.c;
   LOAD_BUFFER()
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
-  temp = temp2 = block[0] - last_dc_val;
+  temp = block[0] - last_dc_val;
 
- /* This is a well-known technique for obtaining the absolute value without a
-  * branch.  It is derived from an assembly language technique presented in
-  * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
-  * Agner Fog.
-  */
-  temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-  temp ^= temp3;
-  temp -= temp3;
-
-  /* For a negative input, want temp2 = bitwise complement of abs(input) */
-  /* This code assumes we are on a two's complement machine */
-  temp2 += temp3;
+  /* This is a well-known technique for obtaining the absolute value without a
+   * branch.  It is derived from an assembly language technique presented in
+   * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
+   * Agner Fog.  This code assumes we are on a two's complement machine.
+   */
+  nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+  temp += nbits;
+  nbits ^= temp;
 
   /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = JPEG_NBITS(temp);
+  nbits = JPEG_NBITS(nbits);
 
-  /* Emit the Huffman-coded symbol for the number of bits */
-  code = dctbl->ehufco[nbits];
-  size = dctbl->ehufsi[nbits];
-  EMIT_BITS(code, size)
-
-  /* Mask off any extra bits in code */
-  temp2 &= (((JLONG) 1)<<nbits) - 1;
-
-  /* Emit that number of bits of the value, if positive, */
-  /* or the complement of its magnitude, if negative. */
-  EMIT_BITS(temp2, nbits)
+  /* Emit the Huffman-coded symbol for the number of bits.
+   * Emit that number of bits of the value, if positive,
+   * or the complement of its magnitude, if negative.
+   */
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
 
   /* Encode the AC coefficients per section F.1.2.2 */
 
-  r = 0;                        /* r = run length of zeros */
+  {
+    int r = 0;                  /* r = run length of zeros */
 
 /* Manually unroll the k loop to eliminate the counter variable.  This
  * improves performance greatly on systems with a limited number of
  * registers (such as x86.)
  */
-#define kloop(jpeg_natural_order_of_k) {  \
+#define kloop(jpeg_natural_order_of_k) { \
   if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
-    r++; \
+    r += 16; \
   } else { \
-    temp2 = temp; \
     /* Branch-less absolute value, bitwise complement, etc., same as above */ \
-    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
-    temp ^= temp3; \
-    temp -= temp3; \
-    temp2 += temp3; \
-    nbits = JPEG_NBITS_NONZERO(temp); \
+    nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
+    temp += nbits; \
+    nbits ^= temp; \
+    nbits = JPEG_NBITS_NONZERO(nbits); \
     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
-    while (r > 15) { \
-      EMIT_BITS(code_0xf0, size_0xf0) \
-      r -= 16; \
+    while (r >= 16 * 16) { \
+      r -= 16 * 16; \
+      PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
     } \
     /* Emit Huffman symbol for run length / number of bits */ \
-    temp3 = (r << 4) + nbits;  \
-    code = actbl->ehufco[temp3]; \
-    size = actbl->ehufsi[temp3]; \
-    EMIT_CODE(code, size) \
-    r = 0;  \
+    r += nbits; \
+    PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
+    r = 0; \
   } \
 }
 
-  /* One iteration for each value in jpeg_natural_order[] */
-  kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
-  kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
-  kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
-  kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
-  kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
-  kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
-  kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
-  kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
-  kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
-  kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
-  kloop(55);  kloop(62);  kloop(63);
+    /* One iteration for each value in jpeg_natural_order[] */
+    kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
+    kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
+    kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
+    kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
+    kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
+    kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
+    kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
+    kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
+    kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
+    kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
+    kloop(55);  kloop(62);  kloop(63);
 
-  /* If the last coef(s) were zero, emit an end-of-block code */
-  if (r > 0) {
-    code = actbl->ehufco[0];
-    size = actbl->ehufsi[0];
-    EMIT_BITS(code, size)
+    /* If the last coef(s) were zero, emit an end-of-block code */
+    if (r > 0) {
+      PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+    }
   }
 
-  state->cur.put_buffer = put_buffer;
-  state->cur.put_bits = put_bits;
+  state->cur.put_buffer.c = put_buffer;
+  state->cur.free_bits = free_bits;
   STORE_BUFFER()
 
   return TRUE;
@@ -616,11 +658,11 @@ encode_one_block (working_state *state, JCOEFPTR block, int last_dc_val,
  */
 
 LOCAL(boolean)
-emit_restart (working_state *state, int restart_num)
+emit_restart(working_state *state, int restart_num)
 {
   int ci;
 
-  if (! flush_bits(state))
+  if (!flush_bits(state))
     return FALSE;
 
   emit_byte(state, 0xFF, return FALSE);
@@ -641,9 +683,9 @@ emit_restart (working_state *state, int restart_num)
  */
 
 METHODDEF(boolean)
-encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   working_state state;
   int blkn, ci;
   jpeg_component_info *compptr;
@@ -651,13 +693,14 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Load up working state */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! emit_restart(&state, entropy->next_restart_num))
+      if (!emit_restart(&state, entropy->next_restart_num))
         return FALSE;
   }
 
@@ -666,10 +709,10 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
       ci = cinfo->MCU_membership[blkn];
       compptr = cinfo->cur_comp_info[ci];
-      if (! encode_one_block_simd(&state,
-                                  MCU_data[blkn][0], state.cur.last_dc_val[ci],
-                                  entropy->dc_derived_tbls[compptr->dc_tbl_no],
-                                  entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+      if (!encode_one_block_simd(&state,
+                                 MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                                 entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                                 entropy->ac_derived_tbls[compptr->ac_tbl_no]))
         return FALSE;
       /* Update last_dc_val */
       state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -678,10 +721,10 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
       ci = cinfo->MCU_membership[blkn];
       compptr = cinfo->cur_comp_info[ci];
-      if (! encode_one_block(&state,
-                             MCU_data[blkn][0], state.cur.last_dc_val[ci],
-                             entropy->dc_derived_tbls[compptr->dc_tbl_no],
-                             entropy->ac_derived_tbls[compptr->ac_tbl_no]))
+      if (!encode_one_block(&state,
+                            MCU_data[blkn][0], state.cur.last_dc_val[ci],
+                            entropy->dc_derived_tbls[compptr->dc_tbl_no],
+                            entropy->ac_derived_tbls[compptr->ac_tbl_no]))
         return FALSE;
       /* Update last_dc_val */
       state.cur.last_dc_val[ci] = MCU_data[blkn][0][0];
@@ -691,7 +734,7 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Completed MCU, so update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 
   /* Update restart-interval state too */
   if (cinfo->restart_interval) {
@@ -712,25 +755,26 @@ encode_mcu_huff (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(void)
-finish_pass_huff (j_compress_ptr cinfo)
+finish_pass_huff(j_compress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   working_state state;
 
   /* Load up working state ... flush_bits needs it */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Flush out the last data */
-  if (! flush_bits(&state))
+  if (!flush_bits(&state))
     ERREXIT(cinfo, JERR_CANT_SUSPEND);
 
   /* Update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 }
 
 
@@ -751,8 +795,8 @@ finish_pass_huff (j_compress_ptr cinfo)
 /* Process a single block's worth of coefficients */
 
 LOCAL(void)
-htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
-                 long dc_counts[], long ac_counts[])
+htest_one_block(j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
+                long dc_counts[], long ac_counts[])
 {
   register int temp;
   register int nbits;
@@ -773,7 +817,7 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
   /* Check for out-of-range coefficient values.
    * Since we're encoding a difference, the range limit is twice as much.
    */
-  if (nbits > MAX_COEF_BITS+1)
+  if (nbits > MAX_COEF_BITS + 1)
     ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
   /* Count the Huffman symbol for the number of bits */
@@ -824,9 +868,9 @@ htest_one_block (j_compress_ptr cinfo, JCOEFPTR block, int last_dc_val,
  */
 
 METHODDEF(boolean)
-encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_gather(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int blkn, ci;
   jpeg_component_info *compptr;
 
@@ -863,13 +907,14 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  * one bits (so that padding bits added at the end of a compressed segment
  * can't look like a valid code).  Because of the canonical ordering of
  * codewords, this just means that there must be an unused slot in the
- * longest codeword length category.  Section K.2 of the JPEG spec suggests
- * reserving such a slot by pretending that symbol 256 is a valid symbol
- * with count 1.  In theory that's not optimal; giving it count zero but
- * including it in the symbol set anyway should give a better Huffman code.
- * But the theoretically better code actually seems to come out worse in
- * practice, because it produces more all-ones bytes (which incur stuffed
- * zero bytes in the final file).  In any case the difference is tiny.
+ * longest codeword length category.  Annex K (Clause K.2) of
+ * Rec. ITU-T T.81 (1992) | ISO/IEC 10918-1:1994 suggests reserving such a slot
+ * by pretending that symbol 256 is a valid symbol with count 1.  In theory
+ * that's not optimal; giving it count zero but including it in the symbol set
+ * anyway should give a better Huffman code.  But the theoretically better code
+ * actually seems to come out worse in practice, because it produces more
+ * all-ones bytes (which incur stuffed zero bytes in the final file).  In any
+ * case the difference is tiny.
  *
  * The JPEG standard requires Huffman codes to be no more than 16 bits long.
  * If some symbols have a very small but nonzero probability, the Huffman tree
@@ -884,10 +929,10 @@ encode_mcu_gather (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 GLOBAL(void)
-jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
+jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 {
-#define MAX_CLEN 32             /* assumed maximum initial code length */
-  UINT8 bits[MAX_CLEN+1];       /* bits[k] = # of symbols with code length k */
+#define MAX_CLEN  32            /* assumed maximum initial code length */
+  UINT8 bits[MAX_CLEN + 1];     /* bits[k] = # of symbols with code length k */
   int codesize[257];            /* codesize[k] = code length of symbol k */
   int others[257];              /* next symbol in current branch of tree */
   int c1, c2;
@@ -896,8 +941,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 
   /* This algorithm is explained in section K.2 of the JPEG standard */
 
-  MEMZERO(bits, sizeof(bits));
-  MEMZERO(codesize, sizeof(codesize));
+  memset(bits, 0, sizeof(bits));
+  memset(codesize, 0, sizeof(codesize));
   for (i = 0; i < 257; i++)
     others[i] = -1;             /* init links to empty */
 
@@ -971,13 +1016,13 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
 
   /* JPEG doesn't allow symbols with code lengths over 16 bits, so if the pure
    * Huffman procedure assigned any such lengths, we must adjust the coding.
-   * Here is what the JPEG spec says about how this next bit works:
-   * Since symbols are paired for the longest Huffman code, the symbols are
-   * removed from this length category two at a time.  The prefix for the pair
-   * (which is one bit shorter) is allocated to one of the pair; then,
-   * skipping the BITS entry for that prefix length, a code word from the next
-   * shortest nonzero BITS entry is converted into a prefix for two code words
-   * one bit longer.
+   * Here is what Rec. ITU-T T.81 | ISO/IEC 10918-1 says about how this next
+   * bit works: Since symbols are paired for the longest Huffman code, the
+   * symbols are removed from this length category two at a time.  The prefix
+   * for the pair (which is one bit shorter) is allocated to one of the pair;
+   * then, skipping the BITS entry for that prefix length, a code word from the
+   * next shortest nonzero BITS entry is converted into a prefix for two code
+   * words one bit longer.
    */
 
   for (i = MAX_CLEN; i > 16; i--) {
@@ -987,8 +1032,8 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
         j--;
 
       bits[i] -= 2;             /* remove two symbols */
-      bits[i-1]++;              /* one goes in this length */
-      bits[j+1] += 2;           /* two new symbols in this length */
+      bits[i - 1]++;            /* one goes in this length */
+      bits[j + 1] += 2;         /* two new symbols in this length */
       bits[j]--;                /* symbol of this length is now a prefix */
     }
   }
@@ -999,17 +1044,18 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
   bits[i]--;
 
   /* Return final symbol counts (only for lengths 0..16) */
-  MEMCOPY(htbl->bits, bits, sizeof(htbl->bits));
+  memcpy(htbl->bits, bits, sizeof(htbl->bits));
 
   /* Return a list of the symbols sorted by code length */
   /* It's not real clear to me why we don't need to consider the codelength
-   * changes made above, but the JPEG spec seems to think this works.
+   * changes made above, but Rec. ITU-T T.81 | ISO/IEC 10918-1 seems to think
+   * this works.
    */
   p = 0;
   for (i = 1; i <= MAX_CLEN; i++) {
     for (j = 0; j <= 255; j++) {
       if (codesize[j] == i) {
-        htbl->huffval[p] = (UINT8) j;
+        htbl->huffval[p] = (UINT8)j;
         p++;
       }
     }
@@ -1025,9 +1071,9 @@ jpeg_gen_optimal_table (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
  */
 
 METHODDEF(void)
-finish_pass_gather (j_compress_ptr cinfo)
+finish_pass_gather(j_compress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci, dctbl, actbl;
   jpeg_component_info *compptr;
   JHUFF_TBL **htblptr;
@@ -1037,24 +1083,24 @@ finish_pass_gather (j_compress_ptr cinfo)
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
-  MEMZERO(did_dc, sizeof(did_dc));
-  MEMZERO(did_ac, sizeof(did_ac));
+  memset(did_dc, 0, sizeof(did_dc));
+  memset(did_ac, 0, sizeof(did_ac));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     dctbl = compptr->dc_tbl_no;
     actbl = compptr->ac_tbl_no;
-    if (! did_dc[dctbl]) {
-      htblptr = & cinfo->dc_huff_tbl_ptrs[dctbl];
+    if (!did_dc[dctbl]) {
+      htblptr = &cinfo->dc_huff_tbl_ptrs[dctbl];
       if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->dc_count_ptrs[dctbl]);
       did_dc[dctbl] = TRUE;
     }
-    if (! did_ac[actbl]) {
-      htblptr = & cinfo->ac_huff_tbl_ptrs[actbl];
+    if (!did_ac[actbl]) {
+      htblptr = &cinfo->ac_huff_tbl_ptrs[actbl];
       if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->ac_count_ptrs[actbl]);
       did_ac[actbl] = TRUE;
     }
@@ -1070,15 +1116,15 @@ finish_pass_gather (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_huff_encoder (j_compress_ptr cinfo)
+jinit_huff_encoder(j_compress_ptr cinfo)
 {
   huff_entropy_ptr entropy;
   int i;
 
   entropy = (huff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(huff_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
   entropy->pub.start_pass = start_pass_huff;
 
   /* Mark tables unallocated */
diff --git a/media/libjpeg/jchuff.h b/media/libjpeg/jchuff.h
index 4236089adc..314a2325c9 100644
--- a/media/libjpeg/jchuff.h
+++ b/media/libjpeg/jchuff.h
@@ -20,9 +20,9 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define MAX_COEF_BITS 10
+#define MAX_COEF_BITS  10
 #else
-#define MAX_COEF_BITS 14
+#define MAX_COEF_BITS  14
 #endif
 
 /* Derived data constructed for each Huffman table */
@@ -34,10 +34,9 @@ typedef struct {
 } c_derived_tbl;
 
 /* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_c_derived_tbl
-        (j_compress_ptr cinfo, boolean isDC, int tblno,
-         c_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC,
+                                     int tblno, c_derived_tbl **pdtbl);
 
 /* Generate an optimal table definition given the specified counts */
-EXTERN(void) jpeg_gen_optimal_table
-        (j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[]);
+EXTERN(void) jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl,
+                                    long freq[]);
diff --git a/media/libjpeg/jcicc.c b/media/libjpeg/jcicc.c
new file mode 100644
index 0000000000..11037ff694
--- /dev/null
+++ b/media/libjpeg/jcicc.c
@@ -0,0 +1,105 @@
+/*
+ * jcicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to write International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files.  The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers.  The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to embed the profile data in a JPEG file while writing it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+/*
+ * Since an ICC profile can be larger than the maximum size of a JPEG marker
+ * (64K), we need provisions to split it into multiple markers.  The format
+ * defined by the ICC specifies one or more APP2 markers containing the
+ * following data:
+ *      Identifying string      ASCII "ICC_PROFILE\0"  (12 bytes)
+ *      Marker sequence number  1 for first APP2, 2 for next, etc (1 byte)
+ *      Number of markers       Total number of APP2's used (1 byte)
+ *      Profile data            (remainder of APP2 data)
+ * Decoders should use the marker sequence numbers to reassemble the profile,
+ * rather than assuming that the APP2 markers appear in the correct sequence.
+ */
+
+#define ICC_MARKER  (JPEG_APP0 + 2)     /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN  14            /* size of non-profile data in APP2 */
+#define MAX_BYTES_IN_MARKER  65533      /* maximum data len of a JPEG marker */
+#define MAX_DATA_BYTES_IN_MARKER  (MAX_BYTES_IN_MARKER - ICC_OVERHEAD_LEN)
+
+
+/*
+ * This routine writes the given ICC profile data into a JPEG file.  It *must*
+ * be called AFTER calling jpeg_start_compress() and BEFORE the first call to
+ * jpeg_write_scanlines().  (This ordering ensures that the APP2 marker(s) will
+ * appear after the SOI and JFIF or Adobe markers, but before all else.)
+ */
+
+GLOBAL(void)
+jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+                       unsigned int icc_data_len)
+{
+  unsigned int num_markers;     /* total number of markers we'll write */
+  int cur_marker = 1;           /* per spec, counting starts at 1 */
+  unsigned int length;          /* number of bytes to write in this marker */
+
+  if (icc_data_ptr == NULL || icc_data_len == 0)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  if (cinfo->global_state < CSTATE_SCANNING)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  /* Calculate the number of markers we'll need, rounding up of course */
+  num_markers = icc_data_len / MAX_DATA_BYTES_IN_MARKER;
+  if (num_markers * MAX_DATA_BYTES_IN_MARKER != icc_data_len)
+    num_markers++;
+
+  while (icc_data_len > 0) {
+    /* length of profile to put in this marker */
+    length = icc_data_len;
+    if (length > MAX_DATA_BYTES_IN_MARKER)
+      length = MAX_DATA_BYTES_IN_MARKER;
+    icc_data_len -= length;
+
+    /* Write the JPEG marker header (APP2 code and marker length) */
+    jpeg_write_m_header(cinfo, ICC_MARKER,
+                        (unsigned int)(length + ICC_OVERHEAD_LEN));
+
+    /* Write the marker identifying string "ICC_PROFILE" (null-terminated).  We
+     * code it in this less-than-transparent way so that the code works even if
+     * the local character set is not ASCII.
+     */
+    jpeg_write_m_byte(cinfo, 0x49);
+    jpeg_write_m_byte(cinfo, 0x43);
+    jpeg_write_m_byte(cinfo, 0x43);
+    jpeg_write_m_byte(cinfo, 0x5F);
+    jpeg_write_m_byte(cinfo, 0x50);
+    jpeg_write_m_byte(cinfo, 0x52);
+    jpeg_write_m_byte(cinfo, 0x4F);
+    jpeg_write_m_byte(cinfo, 0x46);
+    jpeg_write_m_byte(cinfo, 0x49);
+    jpeg_write_m_byte(cinfo, 0x4C);
+    jpeg_write_m_byte(cinfo, 0x45);
+    jpeg_write_m_byte(cinfo, 0x0);
+
+    /* Add the sequencing info */
+    jpeg_write_m_byte(cinfo, cur_marker);
+    jpeg_write_m_byte(cinfo, (int)num_markers);
+
+    /* Add the profile data */
+    while (length--) {
+      jpeg_write_m_byte(cinfo, *icc_data_ptr);
+      icc_data_ptr++;
+    }
+    cur_marker++;
+  }
+}
diff --git a/media/libjpeg/jcinit.c b/media/libjpeg/jcinit.c
index 463bd8c6dd..157353a22e 100644
--- a/media/libjpeg/jcinit.c
+++ b/media/libjpeg/jcinit.c
@@ -1,8 +1,10 @@
 /*
  * jcinit.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -19,6 +21,7 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 /*
@@ -28,13 +31,13 @@
  */
 
 GLOBAL(void)
-jinit_compress_master (j_compress_ptr cinfo)
+jinit_compress_master(j_compress_ptr cinfo)
 {
   /* Initialize master control (includes parameter checking/processing) */
   jinit_c_master_control(cinfo, FALSE /* full compression */);
 
   /* Preprocessing */
-  if (! cinfo->raw_data_in) {
+  if (!cinfo->raw_data_in) {
     jinit_color_converter(cinfo);
     jinit_downsampler(cinfo);
     jinit_c_prep_controller(cinfo, FALSE /* never need full buffer here */);
@@ -60,14 +63,14 @@ jinit_compress_master (j_compress_ptr cinfo)
   }
 
   /* Need a full-image coefficient buffer in any multi-pass mode. */
-  jinit_c_coef_controller(cinfo,
-                (boolean) (cinfo->num_scans > 1 || cinfo->optimize_coding));
+  jinit_c_coef_controller(cinfo, (boolean)(cinfo->num_scans > 1 ||
+                                           cinfo->optimize_coding));
   jinit_c_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   jinit_marker_writer(cinfo);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Write the datastream header (SOI) immediately.
    * Frame and scan headers are postponed till later.
diff --git a/media/libjpeg/jcmainct.c b/media/libjpeg/jcmainct.c
index d01f46364b..3f23028c46 100644
--- a/media/libjpeg/jcmainct.c
+++ b/media/libjpeg/jcmainct.c
@@ -39,9 +39,10 @@ typedef my_main_controller *my_main_ptr;
 
 
 /* Forward declarations */
-METHODDEF(void) process_data_simple_main
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-         JDIMENSION in_rows_avail);
+METHODDEF(void) process_data_simple_main(j_compress_ptr cinfo,
+                                         JSAMPARRAY input_buf,
+                                         JDIMENSION *in_row_ctr,
+                                         JDIMENSION in_rows_avail);
 
 
 /*
@@ -49,9 +50,9 @@ METHODDEF(void) process_data_simple_main
  */
 
 METHODDEF(void)
-start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   /* Do nothing in raw-data mode. */
   if (cinfo->raw_data_in)
@@ -75,19 +76,18 @@ start_pass_main (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(void)
-process_data_simple_main (j_compress_ptr cinfo,
-                          JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-                          JDIMENSION in_rows_avail)
+process_data_simple_main(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                         JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   while (main_ptr->cur_iMCU_row < cinfo->total_iMCU_rows) {
     /* Read input data if we haven't filled the main buffer yet */
     if (main_ptr->rowgroup_ctr < DCTSIZE)
-      (*cinfo->prep->pre_process_data) (cinfo,
-                                        input_buf, in_row_ctr, in_rows_avail,
-                                        main_ptr->buffer, &main_ptr->rowgroup_ctr,
-                                        (JDIMENSION) DCTSIZE);
+      (*cinfo->prep->pre_process_data) (cinfo, input_buf, in_row_ctr,
+                                        in_rows_avail, main_ptr->buffer,
+                                        &main_ptr->rowgroup_ctr,
+                                        (JDIMENSION)DCTSIZE);
 
     /* If we don't have a full iMCU row buffered, return to application for
      * more data.  Note that preprocessor will always pad to fill the iMCU row
@@ -97,14 +97,14 @@ process_data_simple_main (j_compress_ptr cinfo,
       return;
 
     /* Send the completed row to the compressor */
-    if (! (*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
+    if (!(*cinfo->coef->compress_data) (cinfo, main_ptr->buffer)) {
       /* If compressor did not consume the whole row, then we must need to
        * suspend processing and return to the application.  In this situation
        * we pretend we didn't yet consume the last input row; otherwise, if
        * it happened to be the last row of the image, the application would
        * think we were done.
        */
-      if (! main_ptr->suspended) {
+      if (!main_ptr->suspended) {
         (*in_row_ctr)--;
         main_ptr->suspended = TRUE;
       }
@@ -128,16 +128,16 @@ process_data_simple_main (j_compress_ptr cinfo,
  */
 
 GLOBAL(void)
-jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_main_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_main_ptr main_ptr;
   int ci;
   jpeg_component_info *compptr;
 
   main_ptr = (my_main_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_main_controller));
-  cinfo->main = (struct jpeg_c_main_controller *) main_ptr;
+  cinfo->main = (struct jpeg_c_main_controller *)main_ptr;
   main_ptr->pub.start_pass = start_pass_main;
 
   /* We don't need to create a buffer in raw-data mode. */
@@ -154,9 +154,9 @@ jinit_c_main_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
       main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
          compptr->width_in_blocks * DCTSIZE,
-         (JDIMENSION) (compptr->v_samp_factor * DCTSIZE));
+         (JDIMENSION)(compptr->v_samp_factor * DCTSIZE));
     }
   }
 }
diff --git a/media/libjpeg/jcmarker.c b/media/libjpeg/jcmarker.c
index 463f665927..801fbab4ef 100644
--- a/media/libjpeg/jcmarker.c
+++ b/media/libjpeg/jcmarker.c
@@ -110,30 +110,30 @@ typedef my_marker_writer *my_marker_ptr;
  */
 
 LOCAL(void)
-emit_byte (j_compress_ptr cinfo, int val)
+emit_byte(j_compress_ptr cinfo, int val)
 /* Emit a byte */
 {
   struct jpeg_destination_mgr *dest = cinfo->dest;
 
-  *(dest->next_output_byte)++ = (JOCTET) val;
+  *(dest->next_output_byte)++ = (JOCTET)val;
   if (--dest->free_in_buffer == 0) {
-    if (! (*dest->empty_output_buffer) (cinfo))
+    if (!(*dest->empty_output_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   }
 }
 
 
 LOCAL(void)
-emit_marker (j_compress_ptr cinfo, JPEG_MARKER mark)
+emit_marker(j_compress_ptr cinfo, JPEG_MARKER mark)
 /* Emit a marker code */
 {
   emit_byte(cinfo, 0xFF);
-  emit_byte(cinfo, (int) mark);
+  emit_byte(cinfo, (int)mark);
 }
 
 
 LOCAL(void)
-emit_2bytes (j_compress_ptr cinfo, int value)
+emit_2bytes(j_compress_ptr cinfo, int value)
 /* Emit a 2-byte integer; these are always MSB first in JPEG files */
 {
   emit_byte(cinfo, (value >> 8) & 0xFF);
@@ -146,7 +146,7 @@ emit_2bytes (j_compress_ptr cinfo, int value)
  */
 
 LOCAL(int)
-emit_dqt (j_compress_ptr cinfo, int index)
+emit_dqt(j_compress_ptr cinfo, int index)
 /* Emit a DQT marker */
 /* Returns the precision used (0 = 8bits, 1 = 16bits) for baseline checking */
 {
@@ -163,19 +163,19 @@ emit_dqt (j_compress_ptr cinfo, int index)
       prec = 1;
   }
 
-  if (! qtbl->sent_table) {
+  if (!qtbl->sent_table) {
     emit_marker(cinfo, M_DQT);
 
-    emit_2bytes(cinfo, prec ? DCTSIZE2*2 + 1 + 2 : DCTSIZE2 + 1 + 2);
+    emit_2bytes(cinfo, prec ? DCTSIZE2 * 2 + 1 + 2 : DCTSIZE2 + 1 + 2);
 
-    emit_byte(cinfo, index + (prec<<4));
+    emit_byte(cinfo, index + (prec << 4));
 
     for (i = 0; i < DCTSIZE2; i++) {
       /* The table entries must be emitted in zigzag order. */
       unsigned int qval = qtbl->quantval[jpeg_natural_order[i]];
       if (prec)
-        emit_byte(cinfo, (int) (qval >> 8));
-      emit_byte(cinfo, (int) (qval & 0xFF));
+        emit_byte(cinfo, (int)(qval >> 8));
+      emit_byte(cinfo, (int)(qval & 0xFF));
     }
 
     qtbl->sent_table = TRUE;
@@ -186,7 +186,7 @@ emit_dqt (j_compress_ptr cinfo, int index)
 
 
 LOCAL(void)
-emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
+emit_dht(j_compress_ptr cinfo, int index, boolean is_ac)
 /* Emit a DHT marker */
 {
   JHUFF_TBL *htbl;
@@ -202,7 +202,7 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
   if (htbl == NULL)
     ERREXIT1(cinfo, JERR_NO_HUFF_TABLE, index);
 
-  if (! htbl->sent_table) {
+  if (!htbl->sent_table) {
     emit_marker(cinfo, M_DHT);
 
     length = 0;
@@ -224,7 +224,7 @@ emit_dht (j_compress_ptr cinfo, int index, boolean is_ac)
 
 
 LOCAL(void)
-emit_dac (j_compress_ptr cinfo)
+emit_dac(j_compress_ptr cinfo)
 /* Emit a DAC marker */
 /* Since the useful info is so small, we want to emit all the tables in */
 /* one DAC marker.  Therefore this routine does its own scan of the table. */
@@ -255,12 +255,12 @@ emit_dac (j_compress_ptr cinfo)
   if (length) {
     emit_marker(cinfo, M_DAC);
 
-    emit_2bytes(cinfo, length*2 + 2);
+    emit_2bytes(cinfo, length * 2 + 2);
 
     for (i = 0; i < NUM_ARITH_TBLS; i++) {
       if (dc_in_use[i]) {
         emit_byte(cinfo, i);
-        emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i]<<4));
+        emit_byte(cinfo, cinfo->arith_dc_L[i] + (cinfo->arith_dc_U[i] << 4));
       }
       if (ac_in_use[i]) {
         emit_byte(cinfo, i + 0x10);
@@ -273,19 +273,19 @@ emit_dac (j_compress_ptr cinfo)
 
 
 LOCAL(void)
-emit_dri (j_compress_ptr cinfo)
+emit_dri(j_compress_ptr cinfo)
 /* Emit a DRI marker */
 {
   emit_marker(cinfo, M_DRI);
 
   emit_2bytes(cinfo, 4);        /* fixed length */
 
-  emit_2bytes(cinfo, (int) cinfo->restart_interval);
+  emit_2bytes(cinfo, (int)cinfo->restart_interval);
 }
 
 
 LOCAL(void)
-emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
+emit_sof(j_compress_ptr cinfo, JPEG_MARKER code)
 /* Emit a SOF marker */
 {
   int ci;
@@ -296,13 +296,12 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
   emit_2bytes(cinfo, 3 * cinfo->num_components + 2 + 5 + 1); /* length */
 
   /* Make sure image isn't bigger than SOF field can handle */
-  if ((long) cinfo->_jpeg_height > 65535L ||
-      (long) cinfo->_jpeg_width > 65535L)
-    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) 65535);
+  if ((long)cinfo->_jpeg_height > 65535L || (long)cinfo->_jpeg_width > 65535L)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)65535);
 
   emit_byte(cinfo, cinfo->data_precision);
-  emit_2bytes(cinfo, (int) cinfo->_jpeg_height);
-  emit_2bytes(cinfo, (int) cinfo->_jpeg_width);
+  emit_2bytes(cinfo, (int)cinfo->_jpeg_height);
+  emit_2bytes(cinfo, (int)cinfo->_jpeg_width);
 
   emit_byte(cinfo, cinfo->num_components);
 
@@ -316,7 +315,7 @@ emit_sof (j_compress_ptr cinfo, JPEG_MARKER code)
 
 
 LOCAL(void)
-emit_sos (j_compress_ptr cinfo)
+emit_sos(j_compress_ptr cinfo)
 /* Emit a SOS marker */
 {
   int i, td, ta;
@@ -351,7 +350,7 @@ emit_sos (j_compress_ptr cinfo)
 
 
 LOCAL(void)
-emit_jfif_app0 (j_compress_ptr cinfo)
+emit_jfif_app0(j_compress_ptr cinfo)
 /* Emit a JFIF-compliant APP0 marker */
 {
   /*
@@ -378,15 +377,15 @@ emit_jfif_app0 (j_compress_ptr cinfo)
   emit_byte(cinfo, cinfo->JFIF_major_version); /* Version fields */
   emit_byte(cinfo, cinfo->JFIF_minor_version);
   emit_byte(cinfo, cinfo->density_unit); /* Pixel size information */
-  emit_2bytes(cinfo, (int) cinfo->X_density);
-  emit_2bytes(cinfo, (int) cinfo->Y_density);
+  emit_2bytes(cinfo, (int)cinfo->X_density);
+  emit_2bytes(cinfo, (int)cinfo->Y_density);
   emit_byte(cinfo, 0);          /* No thumbnail image */
   emit_byte(cinfo, 0);
 }
 
 
 LOCAL(void)
-emit_adobe_app14 (j_compress_ptr cinfo)
+emit_adobe_app14(j_compress_ptr cinfo)
 /* Emit an Adobe APP14 marker */
 {
   /*
@@ -440,19 +439,19 @@ emit_adobe_app14 (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-write_marker_header (j_compress_ptr cinfo, int marker, unsigned int datalen)
+write_marker_header(j_compress_ptr cinfo, int marker, unsigned int datalen)
 /* Emit an arbitrary marker header */
 {
-  if (datalen > (unsigned int) 65533)           /* safety check */
+  if (datalen > (unsigned int)65533)            /* safety check */
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
-  emit_marker(cinfo, (JPEG_MARKER) marker);
+  emit_marker(cinfo, (JPEG_MARKER)marker);
 
-  emit_2bytes(cinfo, (int) (datalen + 2));      /* total length */
+  emit_2bytes(cinfo, (int)(datalen + 2));       /* total length */
 }
 
 METHODDEF(void)
-write_marker_byte (j_compress_ptr cinfo, int val)
+write_marker_byte(j_compress_ptr cinfo, int val)
 /* Emit one byte of marker parameters following write_marker_header */
 {
   emit_byte(cinfo, val);
@@ -471,9 +470,9 @@ write_marker_byte (j_compress_ptr cinfo, int val)
  */
 
 METHODDEF(void)
-write_file_header (j_compress_ptr cinfo)
+write_file_header(j_compress_ptr cinfo)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
 
   emit_marker(cinfo, M_SOI);    /* first the SOI */
 
@@ -496,7 +495,7 @@ write_file_header (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-write_frame_header (j_compress_ptr cinfo)
+write_frame_header(j_compress_ptr cinfo)
 {
   int ci, prec;
   boolean is_baseline;
@@ -556,9 +555,9 @@ write_frame_header (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-write_scan_header (j_compress_ptr cinfo)
+write_scan_header(j_compress_ptr cinfo)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
   int i;
   jpeg_component_info *compptr;
 
@@ -600,7 +599,7 @@ write_scan_header (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-write_file_trailer (j_compress_ptr cinfo)
+write_file_trailer(j_compress_ptr cinfo)
 {
   emit_marker(cinfo, M_EOI);
 }
@@ -614,7 +613,7 @@ write_file_trailer (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-write_tables_only (j_compress_ptr cinfo)
+write_tables_only(j_compress_ptr cinfo)
 {
   int i;
 
@@ -622,10 +621,10 @@ write_tables_only (j_compress_ptr cinfo)
 
   for (i = 0; i < NUM_QUANT_TBLS; i++) {
     if (cinfo->quant_tbl_ptrs[i] != NULL)
-      (void) emit_dqt(cinfo, i);
+      (void)emit_dqt(cinfo, i);
   }
 
-  if (! cinfo->arith_code) {
+  if (!cinfo->arith_code) {
     for (i = 0; i < NUM_HUFF_TBLS; i++) {
       if (cinfo->dc_huff_tbl_ptrs[i] != NULL)
         emit_dht(cinfo, i, FALSE);
@@ -643,15 +642,15 @@ write_tables_only (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_marker_writer (j_compress_ptr cinfo)
+jinit_marker_writer(j_compress_ptr cinfo)
 {
   my_marker_ptr marker;
 
   /* Create the subobject */
   marker = (my_marker_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_marker_writer));
-  cinfo->marker = (struct jpeg_marker_writer *) marker;
+  cinfo->marker = (struct jpeg_marker_writer *)marker;
   /* Initialize method pointers */
   marker->pub.write_file_header = write_file_header;
   marker->pub.write_frame_header = write_frame_header;
diff --git a/media/libjpeg/jcmaster.c b/media/libjpeg/jcmaster.c
index 03a8b40ea9..c2b2600031 100644
--- a/media/libjpeg/jcmaster.c
+++ b/media/libjpeg/jcmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2003-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -25,9 +25,9 @@
 /* Private state */
 
 typedef enum {
-        main_pass,              /* input data, also do first output step */
-        huff_opt_pass,          /* Huffman code optimization pass */
-        output_pass             /* data output pass */
+  main_pass,                    /* input data, also do first output step */
+  huff_opt_pass,                /* Huffman code optimization pass */
+  output_pass                   /* data output pass */
 } c_pass_type;
 
 typedef struct {
@@ -66,7 +66,7 @@ typedef my_comp_master *my_master_ptr;
  */
 
 GLOBAL(void)
-jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
+jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo)
 /* Do computations that are needed before master selection phase */
 {
   /* Hardwire it to "no scaling" */
@@ -79,7 +79,7 @@ jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo)
 
 
 LOCAL(void)
-initial_setup (j_compress_ptr cinfo, boolean transcode_only)
+initial_setup(j_compress_ptr cinfo, boolean transcode_only)
 /* Do computations that are needed before master selection phase */
 {
   int ci;
@@ -95,19 +95,19 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
 #endif
 
   /* Sanity check on image dimensions */
-  if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0
-      || cinfo->num_components <= 0 || cinfo->input_components <= 0)
+  if (cinfo->_jpeg_height <= 0 || cinfo->_jpeg_width <= 0 ||
+      cinfo->num_components <= 0 || cinfo->input_components <= 0)
     ERREXIT(cinfo, JERR_EMPTY_IMAGE);
 
   /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->_jpeg_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->_jpeg_width > (long) JPEG_MAX_DIMENSION)
-    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+  if ((long)cinfo->_jpeg_height > (long)JPEG_MAX_DIMENSION ||
+      (long)cinfo->_jpeg_width > (long)JPEG_MAX_DIMENSION)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
 
   /* Width of an input scanline must be representable as JDIMENSION. */
-  samplesperrow = (long) cinfo->image_width * (long) cinfo->input_components;
-  jd_samplesperrow = (JDIMENSION) samplesperrow;
-  if ((long) jd_samplesperrow != samplesperrow)
+  samplesperrow = (long)cinfo->image_width * (long)cinfo->input_components;
+  jd_samplesperrow = (JDIMENSION)samplesperrow;
+  if ((long)jd_samplesperrow != samplesperrow)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
 
   /* For now, precision must match compiled-in value... */
@@ -124,8 +124,10 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
   cinfo->max_v_samp_factor = 1;
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
-        compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+    if (compptr->h_samp_factor <= 0 ||
+        compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+        compptr->v_samp_factor <= 0 ||
+        compptr->v_samp_factor > MAX_SAMP_FACTOR)
       ERREXIT(cinfo, JERR_BAD_SAMPLING);
     cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
                                    compptr->h_samp_factor);
@@ -146,18 +148,18 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
 #endif
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_width * (long) compptr->h_samp_factor,
-                    (long) cinfo->max_h_samp_factor);
+      jdiv_round_up((long)cinfo->_jpeg_width * (long)compptr->h_samp_factor,
+                    (long)cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_height * (long) compptr->v_samp_factor,
-                    (long) cinfo->max_v_samp_factor);
+      jdiv_round_up((long)cinfo->_jpeg_height * (long)compptr->v_samp_factor,
+                    (long)cinfo->max_v_samp_factor);
     /* Mark component needed (this flag isn't actually used for compression) */
     compptr->component_needed = TRUE;
   }
@@ -166,15 +168,15 @@ initial_setup (j_compress_ptr cinfo, boolean transcode_only)
    * main controller will call coefficient controller).
    */
   cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->_jpeg_height,
-                  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+    jdiv_round_up((long)cinfo->_jpeg_height,
+                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
 }
 
 
 #ifdef C_MULTISCAN_FILES_SUPPORTED
 
 LOCAL(void)
-validate_script (j_compress_ptr cinfo)
+validate_script(j_compress_ptr cinfo)
 /* Verify that the scan script in cinfo->scan_info[] is valid; also
  * determine whether it uses progressive JPEG, and set cinfo->progressive_mode.
  */
@@ -196,10 +198,10 @@ validate_script (j_compress_ptr cinfo)
    * for progressive JPEG, no scan can have this.
    */
   scanptr = cinfo->scan_info;
-  if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2-1) {
+  if (scanptr->Ss != 0 || scanptr->Se != DCTSIZE2 - 1) {
 #ifdef C_PROGRESSIVE_SUPPORTED
     cinfo->progressive_mode = TRUE;
-    last_bitpos_ptr = & last_bitpos[0][0];
+    last_bitpos_ptr = &last_bitpos[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
       for (coefi = 0; coefi < DCTSIZE2; coefi++)
         *last_bitpos_ptr++ = -1;
@@ -222,7 +224,7 @@ validate_script (j_compress_ptr cinfo)
       if (thisi < 0 || thisi >= cinfo->num_components)
         ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
       /* Components must appear in SOF order within each scan */
-      if (ci > 0 && thisi <= scanptr->component_index[ci-1])
+      if (ci > 0 && thisi <= scanptr->component_index[ci - 1])
         ERREXIT1(cinfo, JERR_BAD_SCAN_SCRIPT, scanno);
     }
     /* Validate progression parameters */
@@ -232,17 +234,17 @@ validate_script (j_compress_ptr cinfo)
     Al = scanptr->Al;
     if (cinfo->progressive_mode) {
 #ifdef C_PROGRESSIVE_SUPPORTED
-      /* The JPEG spec simply gives the ranges 0..13 for Ah and Al, but that
-       * seems wrong: the upper bound ought to depend on data precision.
-       * Perhaps they really meant 0..N+1 for N-bit precision.
+      /* Rec. ITU-T T.81 | ISO/IEC 10918-1 simply gives the ranges 0..13 for Ah
+       * and Al, but that seems wrong: the upper bound ought to depend on data
+       * precision.  Perhaps they really meant 0..N+1 for N-bit precision.
        * Here we allow 0..10 for 8-bit data; Al larger than 10 results in
        * out-of-range reconstructed DC values during the first DC scan,
        * which might cause problems for some decoders.
        */
 #if BITS_IN_JSAMPLE == 8
-#define MAX_AH_AL 10
+#define MAX_AH_AL  10
 #else
-#define MAX_AH_AL 13
+#define MAX_AH_AL  13
 #endif
       if (Ss < 0 || Ss >= DCTSIZE2 || Se < Ss || Se >= DCTSIZE2 ||
           Ah < 0 || Ah > MAX_AH_AL || Al < 0 || Al > MAX_AH_AL)
@@ -255,7 +257,7 @@ validate_script (j_compress_ptr cinfo)
           ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       }
       for (ci = 0; ci < ncomps; ci++) {
-        last_bitpos_ptr = & last_bitpos[scanptr->component_index[ci]][0];
+        last_bitpos_ptr = &last_bitpos[scanptr->component_index[ci]][0];
         if (Ss != 0 && last_bitpos_ptr[0] < 0) /* AC without prior DC scan */
           ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
         for (coefi = Ss; coefi <= Se; coefi++) {
@@ -265,7 +267,7 @@ validate_script (j_compress_ptr cinfo)
               ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
           } else {
             /* not first scan */
-            if (Ah != last_bitpos_ptr[coefi] || Al != Ah-1)
+            if (Ah != last_bitpos_ptr[coefi] || Al != Ah - 1)
               ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
           }
           last_bitpos_ptr[coefi] = Al;
@@ -274,7 +276,7 @@ validate_script (j_compress_ptr cinfo)
 #endif
     } else {
       /* For sequential JPEG, all progression parameters must be these: */
-      if (Ss != 0 || Se != DCTSIZE2-1 || Ah != 0 || Al != 0)
+      if (Ss != 0 || Se != DCTSIZE2 - 1 || Ah != 0 || Al != 0)
         ERREXIT1(cinfo, JERR_BAD_PROG_SCRIPT, scanno);
       /* Make sure components are not sent twice */
       for (ci = 0; ci < ncomps; ci++) {
@@ -301,7 +303,7 @@ validate_script (j_compress_ptr cinfo)
 #endif
   } else {
     for (ci = 0; ci < cinfo->num_components; ci++) {
-      if (! component_sent[ci])
+      if (!component_sent[ci])
         ERREXIT(cinfo, JERR_MISSING_DATA);
     }
   }
@@ -311,7 +313,7 @@ validate_script (j_compress_ptr cinfo)
 
 
 LOCAL(void)
-select_scan_parameters (j_compress_ptr cinfo)
+select_scan_parameters(j_compress_ptr cinfo)
 /* Set up the scan parameters for the current scan */
 {
   int ci;
@@ -319,7 +321,7 @@ select_scan_parameters (j_compress_ptr cinfo)
 #ifdef C_MULTISCAN_FILES_SUPPORTED
   if (cinfo->scan_info != NULL) {
     /* Prepare for current scan --- the script is already validated */
-    my_master_ptr master = (my_master_ptr) cinfo->master;
+    my_master_ptr master = (my_master_ptr)cinfo->master;
     const jpeg_scan_info *scanptr = cinfo->scan_info + master->scan_number;
 
     cinfo->comps_in_scan = scanptr->comps_in_scan;
@@ -331,8 +333,7 @@ select_scan_parameters (j_compress_ptr cinfo)
     cinfo->Se = scanptr->Se;
     cinfo->Ah = scanptr->Ah;
     cinfo->Al = scanptr->Al;
-  }
-  else
+  } else
 #endif
   {
     /* Prepare for single sequential-JPEG scan containing all components */
@@ -344,7 +345,7 @@ select_scan_parameters (j_compress_ptr cinfo)
       cinfo->cur_comp_info[ci] = &cinfo->comp_info[ci];
     }
     cinfo->Ss = 0;
-    cinfo->Se = DCTSIZE2-1;
+    cinfo->Se = DCTSIZE2 - 1;
     cinfo->Ah = 0;
     cinfo->Al = 0;
   }
@@ -352,7 +353,7 @@ select_scan_parameters (j_compress_ptr cinfo)
 
 
 LOCAL(void)
-per_scan_setup (j_compress_ptr cinfo)
+per_scan_setup(j_compress_ptr cinfo)
 /* Do computations that are needed before processing a JPEG scan */
 /* cinfo->comps_in_scan and cinfo->cur_comp_info[] are already set */
 {
@@ -377,7 +378,7 @@ per_scan_setup (j_compress_ptr cinfo)
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
      */
-    tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+    tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
 
@@ -394,11 +395,11 @@ per_scan_setup (j_compress_ptr cinfo)
 
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_width,
-                    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_width,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->_jpeg_height,
-                    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->_jpeg_height,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
 
     cinfo->blocks_in_MCU = 0;
 
@@ -410,10 +411,10 @@ per_scan_setup (j_compress_ptr cinfo)
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
       compptr->MCU_sample_width = compptr->MCU_width * DCTSIZE;
       /* Figure number of non-dummy blocks in last MCU column & row */
-      tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+      tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
       compptr->last_col_width = tmp;
-      tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+      tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
       if (tmp == 0) tmp = compptr->MCU_height;
       compptr->last_row_height = tmp;
       /* Prepare array describing MCU composition */
@@ -430,8 +431,8 @@ per_scan_setup (j_compress_ptr cinfo)
   /* Convert restart specified in rows to actual MCU count. */
   /* Note that count must fit in 16 bits, so we provide limiting. */
   if (cinfo->restart_in_rows > 0) {
-    long nominal = (long) cinfo->restart_in_rows * (long) cinfo->MCUs_per_row;
-    cinfo->restart_interval = (unsigned int) MIN(nominal, 65535L);
+    long nominal = (long)cinfo->restart_in_rows * (long)cinfo->MCUs_per_row;
+    cinfo->restart_interval = (unsigned int)MIN(nominal, 65535L);
   }
 }
 
@@ -445,9 +446,9 @@ per_scan_setup (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-prepare_for_pass (j_compress_ptr cinfo)
+prepare_for_pass(j_compress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   switch (master->pass_type) {
   case main_pass:
@@ -456,7 +457,7 @@ prepare_for_pass (j_compress_ptr cinfo)
      */
     select_scan_parameters(cinfo);
     per_scan_setup(cinfo);
-    if (! cinfo->raw_data_in) {
+    if (!cinfo->raw_data_in) {
       (*cinfo->cconvert->start_pass) (cinfo);
       (*cinfo->downsample->start_pass) (cinfo);
       (*cinfo->prep->start_pass) (cinfo, JBUF_PASS_THRU);
@@ -491,12 +492,12 @@ prepare_for_pass (j_compress_ptr cinfo)
      */
     master->pass_type = output_pass;
     master->pass_number++;
-    /*FALLTHROUGH*/
 #endif
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case output_pass:
     /* Do a data-output pass. */
     /* We need not repeat per-scan setup if prior optimization pass did it. */
-    if (! cinfo->optimize_coding) {
+    if (!cinfo->optimize_coding) {
       select_scan_parameters(cinfo);
       per_scan_setup(cinfo);
     }
@@ -512,7 +513,7 @@ prepare_for_pass (j_compress_ptr cinfo)
     ERREXIT(cinfo, JERR_NOT_COMPILED);
   }
 
-  master->pub.is_last_pass = (master->pass_number == master->total_passes-1);
+  master->pub.is_last_pass = (master->pass_number == master->total_passes - 1);
 
   /* Set up progress monitor's pass info if present */
   if (cinfo->progress != NULL) {
@@ -533,7 +534,7 @@ prepare_for_pass (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-pass_startup (j_compress_ptr cinfo)
+pass_startup(j_compress_ptr cinfo)
 {
   cinfo->master->call_pass_startup = FALSE; /* reset flag so call only once */
 
@@ -547,9 +548,9 @@ pass_startup (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-finish_pass_master (j_compress_ptr cinfo)
+finish_pass_master(j_compress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   /* The entropy coder always needs an end-of-pass call,
    * either to analyze statistics or to flush its output buffer.
@@ -563,7 +564,7 @@ finish_pass_master (j_compress_ptr cinfo)
      * or output of scan 1 (if no optimization).
      */
     master->pass_type = output_pass;
-    if (! cinfo->optimize_coding)
+    if (!cinfo->optimize_coding)
       master->scan_number++;
     break;
   case huff_opt_pass:
@@ -587,14 +588,14 @@ finish_pass_master (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_c_master_control (j_compress_ptr cinfo, boolean transcode_only)
+jinit_c_master_control(j_compress_ptr cinfo, boolean transcode_only)
 {
   my_master_ptr master;
 
   master = (my_master_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  sizeof(my_comp_master));
-  cinfo->master = (struct jpeg_comp_master *) master;
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_comp_master));
+  cinfo->master = (struct jpeg_comp_master *)master;
   master->pub.prepare_for_pass = prepare_for_pass;
   master->pub.pass_startup = pass_startup;
   master->pub.finish_pass = finish_pass_master;
diff --git a/media/libjpeg/jcomapi.c b/media/libjpeg/jcomapi.c
index 6e5bf3dba9..efbb8357b0 100644
--- a/media/libjpeg/jcomapi.c
+++ b/media/libjpeg/jcomapi.c
@@ -29,7 +29,7 @@
  */
 
 GLOBAL(void)
-jpeg_abort (j_common_ptr cinfo)
+jpeg_abort(j_common_ptr cinfo)
 {
   int pool;
 
@@ -40,7 +40,7 @@ jpeg_abort (j_common_ptr cinfo)
   /* Releasing pools in reverse order might help avoid fragmentation
    * with some (brain-damaged) malloc libraries.
    */
-  for (pool = JPOOL_NUMPOOLS-1; pool > JPOOL_PERMANENT; pool--) {
+  for (pool = JPOOL_NUMPOOLS - 1; pool > JPOOL_PERMANENT; pool--) {
     (*cinfo->mem->free_pool) (cinfo, pool);
   }
 
@@ -50,7 +50,7 @@ jpeg_abort (j_common_ptr cinfo)
     /* Try to keep application from accessing now-deleted marker list.
      * A bit kludgy to do it here, but this is the most central place.
      */
-    ((j_decompress_ptr) cinfo)->marker_list = NULL;
+    ((j_decompress_ptr)cinfo)->marker_list = NULL;
   } else {
     cinfo->global_state = CSTATE_START;
   }
@@ -69,7 +69,7 @@ jpeg_abort (j_common_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_destroy (j_common_ptr cinfo)
+jpeg_destroy(j_common_ptr cinfo)
 {
   /* We need only tell the memory manager to release everything. */
   /* NB: mem pointer is NULL if memory mgr failed to initialize. */
@@ -86,7 +86,7 @@ jpeg_destroy (j_common_ptr cinfo)
  */
 
 GLOBAL(JQUANT_TBL *)
-jpeg_alloc_quant_table (j_common_ptr cinfo)
+jpeg_alloc_quant_table(j_common_ptr cinfo)
 {
   JQUANT_TBL *tbl;
 
@@ -98,7 +98,7 @@ jpeg_alloc_quant_table (j_common_ptr cinfo)
 
 
 GLOBAL(JHUFF_TBL *)
-jpeg_alloc_huff_table (j_common_ptr cinfo)
+jpeg_alloc_huff_table(j_common_ptr cinfo)
 {
   JHUFF_TBL *tbl;
 
diff --git a/media/libjpeg/jconfigint.h b/media/libjpeg/jconfigint.h
index 1289399f94..02be97857f 100644
--- a/media/libjpeg/jconfigint.h
+++ b/media/libjpeg/jconfigint.h
@@ -1,7 +1,47 @@
-#define VERSION "1.5.1"
-#define BUILD "2016-09-20"
-#define PACKAGE_NAME "libjpeg-turbo"
+/* libjpeg-turbo build number */
+#define BUILD "20220624"
 
 /* Need to use Mozilla-specific function inlining. */
 #include "mozilla/Attributes.h"
 #define INLINE MOZ_ALWAYS_INLINE
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "libjpeg-turbo"
+
+/* Version number of package */
+#define VERSION "2.1.3"
+
+/* The size of `size_t', as computed by sizeof. */
+#ifdef HAVE_64BIT_BUILD
+#define SIZEOF_SIZE_T 8
+#else
+#define SIZEOF_SIZE_T 4
+#endif
+
+/* Define if your compiler has __builtin_ctzl() and sizeof(unsigned long) == sizeof(size_t). */
+#ifndef _MSC_VER
+#define HAVE_BUILTIN_CTZL 1
+#endif
+
+/* Define to 1 if you have the <intrin.h> header file. */
+#ifdef _MSC_VER
+#define HAVE_INTRIN_H 1
+#endif
+
+#if defined(_MSC_VER) && defined(HAVE_INTRIN_H)
+#if (SIZEOF_SIZE_T == 8)
+#define HAVE_BITSCANFORWARD64
+#elif (SIZEOF_SIZE_T == 4)
+#define HAVE_BITSCANFORWARD
+#endif
+#endif
+
+#if defined(__has_attribute)
+#if __has_attribute(fallthrough)
+#define FALLTHROUGH  __attribute__((fallthrough));
+#else
+#define FALLTHROUGH
+#endif
+#else
+#define FALLTHROUGH
+#endif
diff --git a/media/libjpeg/jcparam.c b/media/libjpeg/jcparam.c
index 18b2d487ae..5bc7174dcb 100644
--- a/media/libjpeg/jcparam.c
+++ b/media/libjpeg/jcparam.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2008 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, D. R. Commander.
+ * Copyright (C) 2009-2011, 2018, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -25,9 +25,9 @@
  */
 
 GLOBAL(void)
-jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-                      const unsigned int *basic_table,
-                      int scale_factor, boolean force_baseline)
+jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                     const unsigned int *basic_table, int scale_factor,
+                     boolean force_baseline)
 /* Define a quantization table equal to the basic_table times
  * a scale factor (given as a percentage).
  * If force_baseline is TRUE, the computed quantization table entries
@@ -45,19 +45,19 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
   if (which_tbl < 0 || which_tbl >= NUM_QUANT_TBLS)
     ERREXIT1(cinfo, JERR_DQT_INDEX, which_tbl);
 
-  qtblptr = & cinfo->quant_tbl_ptrs[which_tbl];
+  qtblptr = &cinfo->quant_tbl_ptrs[which_tbl];
 
   if (*qtblptr == NULL)
-    *qtblptr = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+    *qtblptr = jpeg_alloc_quant_table((j_common_ptr)cinfo);
 
   for (i = 0; i < DCTSIZE2; i++) {
-    temp = ((long) basic_table[i] * scale_factor + 50L) / 100L;
+    temp = ((long)basic_table[i] * scale_factor + 50L) / 100L;
     /* limit the values to the valid range */
     if (temp <= 0L) temp = 1L;
     if (temp > 32767L) temp = 32767L; /* max quantizer needed for 12 bits */
     if (force_baseline && temp > 255L)
       temp = 255L;              /* limit to baseline range if requested */
-    (*qtblptr)->quantval[i] = (UINT16) temp;
+    (*qtblptr)->quantval[i] = (UINT16)temp;
   }
 
   /* Initialize sent_table FALSE so table will be written to JPEG file. */
@@ -65,7 +65,8 @@ jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
 }
 
 
-/* These are the sample quantization tables given in JPEG spec section K.1.
+/* These are the sample quantization tables given in Annex K (Clause K.1) of
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
  * The spec says that the values given produce "good" quality, and
  * when divided by 2, "very good" quality.
  */
@@ -93,7 +94,7 @@ static const unsigned int std_chrominance_quant_tbl[DCTSIZE2] = {
 
 #if JPEG_LIB_VERSION >= 70
 GLOBAL(void)
-jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables
  * and straight percentage-scaling quality scales.
  * This entry point allows different scalings for luminance and chrominance.
@@ -109,8 +110,8 @@ jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
 
 
 GLOBAL(void)
-jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-                         boolean force_baseline)
+jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                        boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables
  * and a straight percentage-scaling quality scale.  In most cases it's better
  * to use jpeg_set_quality (below); this entry point is provided for
@@ -126,7 +127,7 @@ jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
 
 
 GLOBAL(int)
-jpeg_quality_scaling (int quality)
+jpeg_quality_scaling(int quality)
 /* Convert a user-specified quality rating to a percentage scaling factor
  * for an underlying quantization table, using our recommended scaling curve.
  * The input 'quality' factor should be 0 (terrible) to 100 (very good).
@@ -145,14 +146,14 @@ jpeg_quality_scaling (int quality)
   if (quality < 50)
     quality = 5000 / quality;
   else
-    quality = 200 - quality*2;
+    quality = 200 - quality * 2;
 
   return quality;
 }
 
 
 GLOBAL(void)
-jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+jpeg_set_quality(j_compress_ptr cinfo, int quality, boolean force_baseline)
 /* Set or change the 'quality' (quantization) setting, using default tables.
  * This is the standard quality-adjusting entry point for typical user
  * interfaces; only those who want detailed control over quantization tables
@@ -178,7 +179,7 @@ jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
  */
 
 GLOBAL(void)
-jpeg_set_defaults (j_compress_ptr cinfo)
+jpeg_set_defaults(j_compress_ptr cinfo)
 {
   int i;
 
@@ -192,7 +193,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
    */
   if (cinfo->comp_info == NULL)
     cinfo->comp_info = (jpeg_component_info *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   MAX_COMPONENTS * sizeof(jpeg_component_info));
 
   /* Initialize everything not dependent on the color space */
@@ -205,7 +206,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
   /* Set up two quantization tables using default quality of 75 */
   jpeg_set_quality(cinfo, 75, TRUE);
   /* Set up two Huffman tables */
-  std_huff_tables((j_common_ptr) cinfo);
+  std_huff_tables((j_common_ptr)cinfo);
 
   /* Initialize default arithmetic coding conditioning */
   for (i = 0; i < NUM_ARITH_TBLS; i++) {
@@ -278,7 +279,7 @@ jpeg_set_defaults (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_default_colorspace (j_compress_ptr cinfo)
+jpeg_default_colorspace(j_compress_ptr cinfo)
 {
   switch (cinfo->in_color_space) {
   case JCS_GRAYSCALE:
@@ -320,12 +321,12 @@ jpeg_default_colorspace (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
 {
   jpeg_component_info *compptr;
   int ci;
 
-#define SET_COMP(index,id,hsamp,vsamp,quant,dctbl,actbl)  \
+#define SET_COMP(index, id, hsamp, vsamp, quant, dctbl, actbl) \
   (compptr = &cinfo->comp_info[index], \
    compptr->component_id = (id), \
    compptr->h_samp_factor = (hsamp), \
@@ -352,39 +353,39 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
     cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
     cinfo->num_components = 1;
     /* JFIF specifies component ID 1 */
-    SET_COMP(0, 1, 1,1, 0, 0,0);
+    SET_COMP(0, 1, 1, 1, 0, 0, 0);
     break;
   case JCS_RGB:
     cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag RGB */
     cinfo->num_components = 3;
-    SET_COMP(0, 0x52 /* 'R' */, 1,1, 0, 0,0);
-    SET_COMP(1, 0x47 /* 'G' */, 1,1, 0, 0,0);
-    SET_COMP(2, 0x42 /* 'B' */, 1,1, 0, 0,0);
+    SET_COMP(0, 0x52 /* 'R' */, 1, 1, 0, 0, 0);
+    SET_COMP(1, 0x47 /* 'G' */, 1, 1, 0, 0, 0);
+    SET_COMP(2, 0x42 /* 'B' */, 1, 1, 0, 0, 0);
     break;
   case JCS_YCbCr:
     cinfo->write_JFIF_header = TRUE; /* Write a JFIF marker */
     cinfo->num_components = 3;
     /* JFIF specifies component IDs 1,2,3 */
     /* We default to 2x2 subsamples of chrominance */
-    SET_COMP(0, 1, 2,2, 0, 0,0);
-    SET_COMP(1, 2, 1,1, 1, 1,1);
-    SET_COMP(2, 3, 1,1, 1, 1,1);
+    SET_COMP(0, 1, 2, 2, 0, 0, 0);
+    SET_COMP(1, 2, 1, 1, 1, 1, 1);
+    SET_COMP(2, 3, 1, 1, 1, 1, 1);
     break;
   case JCS_CMYK:
     cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag CMYK */
     cinfo->num_components = 4;
-    SET_COMP(0, 0x43 /* 'C' */, 1,1, 0, 0,0);
-    SET_COMP(1, 0x4D /* 'M' */, 1,1, 0, 0,0);
-    SET_COMP(2, 0x59 /* 'Y' */, 1,1, 0, 0,0);
-    SET_COMP(3, 0x4B /* 'K' */, 1,1, 0, 0,0);
+    SET_COMP(0, 0x43 /* 'C' */, 1, 1, 0, 0, 0);
+    SET_COMP(1, 0x4D /* 'M' */, 1, 1, 0, 0, 0);
+    SET_COMP(2, 0x59 /* 'Y' */, 1, 1, 0, 0, 0);
+    SET_COMP(3, 0x4B /* 'K' */, 1, 1, 0, 0, 0);
     break;
   case JCS_YCCK:
     cinfo->write_Adobe_marker = TRUE; /* write Adobe marker to flag YCCK */
     cinfo->num_components = 4;
-    SET_COMP(0, 1, 2,2, 0, 0,0);
-    SET_COMP(1, 2, 1,1, 1, 1,1);
-    SET_COMP(2, 3, 1,1, 1, 1,1);
-    SET_COMP(3, 4, 2,2, 0, 0,0);
+    SET_COMP(0, 1, 2, 2, 0, 0, 0);
+    SET_COMP(1, 2, 1, 1, 1, 1, 1);
+    SET_COMP(2, 3, 1, 1, 1, 1, 1);
+    SET_COMP(3, 4, 2, 2, 0, 0, 0);
     break;
   case JCS_UNKNOWN:
     cinfo->num_components = cinfo->input_components;
@@ -392,7 +393,7 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
       ERREXIT2(cinfo, JERR_COMPONENT_COUNT, cinfo->num_components,
                MAX_COMPONENTS);
     for (ci = 0; ci < cinfo->num_components; ci++) {
-      SET_COMP(ci, ci, 1,1, 0, 0,0);
+      SET_COMP(ci, ci, 1, 1, 0, 0, 0);
     }
     break;
   default:
@@ -404,8 +405,7 @@ jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
 #ifdef C_PROGRESSIVE_SUPPORTED
 
 LOCAL(jpeg_scan_info *)
-fill_a_scan (jpeg_scan_info *scanptr, int ci,
-             int Ss, int Se, int Ah, int Al)
+fill_a_scan(jpeg_scan_info *scanptr, int ci, int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for specified component */
 {
   scanptr->comps_in_scan = 1;
@@ -419,8 +419,7 @@ fill_a_scan (jpeg_scan_info *scanptr, int ci,
 }
 
 LOCAL(jpeg_scan_info *)
-fill_scans (jpeg_scan_info *scanptr, int ncomps,
-            int Ss, int Se, int Ah, int Al)
+fill_scans(jpeg_scan_info *scanptr, int ncomps, int Ss, int Se, int Ah, int Al)
 /* Support routine: generate one scan for each component */
 {
   int ci;
@@ -438,7 +437,7 @@ fill_scans (jpeg_scan_info *scanptr, int ncomps,
 }
 
 LOCAL(jpeg_scan_info *)
-fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
+fill_dc_scans(jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
 /* Support routine: generate interleaved DC scan if possible, else N scans */
 {
   int ci;
@@ -466,7 +465,7 @@ fill_dc_scans (jpeg_scan_info *scanptr, int ncomps, int Ah, int Al)
  */
 
 GLOBAL(void)
-jpeg_simple_progression (j_compress_ptr cinfo)
+jpeg_simple_progression(j_compress_ptr cinfo)
 {
   int ncomps = cinfo->num_components;
   int nscans;
@@ -498,7 +497,7 @@ jpeg_simple_progression (j_compress_ptr cinfo)
   if (cinfo->script_space == NULL || cinfo->script_space_size < nscans) {
     cinfo->script_space_size = MAX(nscans, 10);
     cinfo->script_space = (jpeg_scan_info *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                         cinfo->script_space_size * sizeof(jpeg_scan_info));
   }
   scanptr = cinfo->script_space;
diff --git a/media/libjpeg/jcphuff.c b/media/libjpeg/jcphuff.c
index 046e2e18d4..872e570bff 100644
--- a/media/libjpeg/jcphuff.c
+++ b/media/libjpeg/jcphuff.c
@@ -4,7 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
+ * Copyright (C) 2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ * Copyright (C) 2021, Alex Richardson.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -18,15 +21,74 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
-#include "jchuff.h"             /* Declarations shared with jchuff.c */
+#include "jsimd.h"
+#include "jconfigint.h"
+#include <limits.h>
+
+#ifdef HAVE_INTRIN_H
+#include <intrin.h>
+#ifdef _MSC_VER
+#ifdef HAVE_BITSCANFORWARD64
+#pragma intrinsic(_BitScanForward64)
+#endif
+#ifdef HAVE_BITSCANFORWARD
+#pragma intrinsic(_BitScanForward)
+#endif
+#endif
+#endif
 
 #ifdef C_PROGRESSIVE_SUPPORTED
 
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table.  This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.)  This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on Arm processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it.  When building for Armv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if (defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))) || \
+    defined(_M_ARM) || defined(_M_ARM64)
+#if !defined(__thumb__) || defined(__thumb2__)
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
+#else
+#define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
+#endif
+#define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
+#include "jpeg_nbits_table.h"
+#define JPEG_NBITS(x)          (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x)  JPEG_NBITS(x)
+#endif
+
+
 /* Expanded entropy encoder object for progressive Huffman encoding. */
 
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
 
+  /* Pointer to routine to prepare data for encode_mcu_AC_first() */
+  void (*AC_first_prepare) (const JCOEF *block,
+                            const int *jpeg_natural_order_start, int Sl,
+                            int Al, JCOEF *values, size_t *zerobits);
+  /* Pointer to routine to prepare data for encode_mcu_AC_refine() */
+  int (*AC_refine_prepare) (const JCOEF *block,
+                            const int *jpeg_natural_order_start, int Sl,
+                            int Al, JCOEF *absvalues, size_t *bits);
+
   /* Mode flag: TRUE for optimization, FALSE for actual data output */
   boolean gather_statistics;
 
@@ -79,26 +141,62 @@ typedef phuff_entropy_encoder *phuff_entropy_ptr;
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
 #define ISHIFT_TEMPS    int ishift_temp;
-#define IRIGHT_SHIFT(x,shft)  \
-        ((ishift_temp = (x)) < 0 ? \
-         (ishift_temp >> (shft)) | ((~0) << (16-(shft))) : \
-         (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+  ((ishift_temp = (x)) < 0 ? \
+   (ishift_temp >> (shft)) | ((~0) << (16 - (shft))) : \
+   (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft)   ((x) >> (shft))
 #endif
 
+#define PAD(v, p)  ((v + (p) - 1) & (~((p) - 1)))
+
 /* Forward declarations */
-METHODDEF(boolean) encode_mcu_DC_first (j_compress_ptr cinfo,
+METHODDEF(boolean) encode_mcu_DC_first(j_compress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(void) encode_mcu_AC_first_prepare
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+METHODDEF(boolean) encode_mcu_AC_first(j_compress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(boolean) encode_mcu_DC_refine(j_compress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_first (j_compress_ptr cinfo,
+METHODDEF(int) encode_mcu_AC_refine_prepare
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
+METHODDEF(boolean) encode_mcu_AC_refine(j_compress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_DC_refine (j_compress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) encode_mcu_AC_refine (j_compress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
-METHODDEF(void) finish_pass_phuff (j_compress_ptr cinfo);
-METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_phuff(j_compress_ptr cinfo);
+METHODDEF(void) finish_pass_gather_phuff(j_compress_ptr cinfo);
+
+
+/* Count bit loop zeroes */
+INLINE
+METHODDEF(int)
+count_zeroes(size_t *x)
+{
+#if defined(HAVE_BUILTIN_CTZL)
+  int result;
+  result = __builtin_ctzl(*x);
+  *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD64)
+  unsigned long result;
+  _BitScanForward64(&result, *x);
+  *x >>= result;
+#elif defined(HAVE_BITSCANFORWARD)
+  unsigned long result;
+  _BitScanForward(&result, *x);
+  *x >>= result;
+#else
+  int result = 0;
+  while ((*x & 1) == 0) {
+    ++result;
+    *x >>= 1;
+  }
+#endif
+  return (int)result;
+}
 
 
 /*
@@ -106,9 +204,9 @@ METHODDEF(void) finish_pass_gather_phuff (j_compress_ptr cinfo);
  */
 
 METHODDEF(void)
-start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
+start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
   jpeg_component_info *compptr;
@@ -126,15 +224,23 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
       entropy->pub.encode_mcu = encode_mcu_DC_first;
     else
       entropy->pub.encode_mcu = encode_mcu_AC_first;
+    if (jsimd_can_encode_mcu_AC_first_prepare())
+      entropy->AC_first_prepare = jsimd_encode_mcu_AC_first_prepare;
+    else
+      entropy->AC_first_prepare = encode_mcu_AC_first_prepare;
   } else {
     if (is_DC_band)
       entropy->pub.encode_mcu = encode_mcu_DC_refine;
     else {
       entropy->pub.encode_mcu = encode_mcu_AC_refine;
+      if (jsimd_can_encode_mcu_AC_refine_prepare())
+        entropy->AC_refine_prepare = jsimd_encode_mcu_AC_refine_prepare;
+      else
+        entropy->AC_refine_prepare = encode_mcu_AC_refine_prepare;
       /* AC refinement needs a correction bit buffer */
       if (entropy->bit_buffer == NULL)
         entropy->bit_buffer = (char *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       MAX_CORR_BITS * sizeof(char));
     }
   }
@@ -167,14 +273,14 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
       /* Note that jpeg_gen_optimal_table expects 257 entries in each table! */
       if (entropy->count_ptrs[tbl] == NULL)
         entropy->count_ptrs[tbl] = (long *)
-          (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                       257 * sizeof(long));
-      MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
+      memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long));
     } else {
       /* Compute derived values for Huffman table */
       /* We may do this more than once for a table, but it's not expensive */
       jpeg_make_c_derived_tbl(cinfo, is_DC_band, tbl,
-                              & entropy->derived_tbls[tbl]);
+                              &entropy->derived_tbls[tbl]);
     }
   }
 
@@ -198,19 +304,20 @@ start_pass_phuff (j_compress_ptr cinfo, boolean gather_statistics)
  */
 
 /* Emit a byte */
-#define emit_byte(entropy,val)  \
-        { *(entropy)->next_output_byte++ = (JOCTET) (val);  \
-          if (--(entropy)->free_in_buffer == 0)  \
-            dump_buffer(entropy); }
+#define emit_byte(entropy, val) { \
+  *(entropy)->next_output_byte++ = (JOCTET)(val); \
+  if (--(entropy)->free_in_buffer == 0) \
+    dump_buffer(entropy); \
+}
 
 
 LOCAL(void)
-dump_buffer (phuff_entropy_ptr entropy)
+dump_buffer(phuff_entropy_ptr entropy)
 /* Empty the output buffer; we do not support suspension in this module. */
 {
   struct jpeg_destination_mgr *dest = entropy->cinfo->dest;
 
-  if (! (*dest->empty_output_buffer) (entropy->cinfo))
+  if (!(*dest->empty_output_buffer) (entropy->cinfo))
     ERREXIT(entropy->cinfo, JERR_CANT_SUSPEND);
   /* After a successful buffer dump, must reset buffer pointers */
   entropy->next_output_byte = dest->next_output_byte;
@@ -227,11 +334,11 @@ dump_buffer (phuff_entropy_ptr entropy)
  */
 
 LOCAL(void)
-emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
+emit_bits(phuff_entropy_ptr entropy, unsigned int code, int size)
 /* Emit some bits, unless we are in gather mode */
 {
   /* This routine is heavily used, so it's worth coding tightly. */
-  register size_t put_buffer = (size_t) code;
+  register size_t put_buffer = (size_t)code;
   register int put_bits = entropy->put_bits;
 
   /* if size is 0, caller used an invalid Huffman table entry */
@@ -241,7 +348,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
   if (entropy->gather_statistics)
     return;                     /* do nothing if we're only getting stats */
 
-  put_buffer &= (((size_t) 1)<<size) - 1; /* mask off any extra bits in code */
+  put_buffer &= (((size_t)1) << size) - 1; /* mask off any extra bits in code */
 
   put_bits += size;             /* new number of bits in buffer */
 
@@ -250,7 +357,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
   put_buffer |= entropy->put_buffer; /* and merge with old buffer contents */
 
   while (put_bits >= 8) {
-    int c = (int) ((put_buffer >> 16) & 0xFF);
+    int c = (int)((put_buffer >> 16) & 0xFF);
 
     emit_byte(entropy, c);
     if (c == 0xFF) {            /* need to stuff a zero byte? */
@@ -266,7 +373,7 @@ emit_bits (phuff_entropy_ptr entropy, unsigned int code, int size)
 
 
 LOCAL(void)
-flush_bits (phuff_entropy_ptr entropy)
+flush_bits(phuff_entropy_ptr entropy)
 {
   emit_bits(entropy, 0x7F, 7); /* fill any partial byte with ones */
   entropy->put_buffer = 0;     /* and reset bit-buffer to empty */
@@ -279,7 +386,7 @@ flush_bits (phuff_entropy_ptr entropy)
  */
 
 LOCAL(void)
-emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
+emit_symbol(phuff_entropy_ptr entropy, int tbl_no, int symbol)
 {
   if (entropy->gather_statistics)
     entropy->count_ptrs[tbl_no][symbol]++;
@@ -295,14 +402,14 @@ emit_symbol (phuff_entropy_ptr entropy, int tbl_no, int symbol)
  */
 
 LOCAL(void)
-emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
-                    unsigned int nbits)
+emit_buffered_bits(phuff_entropy_ptr entropy, char *bufstart,
+                   unsigned int nbits)
 {
   if (entropy->gather_statistics)
     return;                     /* no real work */
 
   while (nbits > 0) {
-    emit_bits(entropy, (unsigned int) (*bufstart), 1);
+    emit_bits(entropy, (unsigned int)(*bufstart), 1);
     bufstart++;
     nbits--;
   }
@@ -314,15 +421,13 @@ emit_buffered_bits (phuff_entropy_ptr entropy, char *bufstart,
  */
 
 LOCAL(void)
-emit_eobrun (phuff_entropy_ptr entropy)
+emit_eobrun(phuff_entropy_ptr entropy)
 {
   register int temp, nbits;
 
   if (entropy->EOBRUN > 0) {    /* if there is any pending EOBRUN */
     temp = entropy->EOBRUN;
-    nbits = 0;
-    while ((temp >>= 1))
-      nbits++;
+    nbits = JPEG_NBITS_NONZERO(temp) - 1;
     /* safety check: shouldn't happen given limited correction-bit buffer */
     if (nbits > 14)
       ERREXIT(entropy->cinfo, JERR_HUFF_MISSING_CODE);
@@ -345,13 +450,13 @@ emit_eobrun (phuff_entropy_ptr entropy)
  */
 
 LOCAL(void)
-emit_restart (phuff_entropy_ptr entropy, int restart_num)
+emit_restart(phuff_entropy_ptr entropy, int restart_num)
 {
   int ci;
 
   emit_eobrun(entropy);
 
-  if (! entropy->gather_statistics) {
+  if (!entropy->gather_statistics) {
     flush_bits(entropy);
     emit_byte(entropy, 0xFF);
     emit_byte(entropy, JPEG_RST0 + restart_num);
@@ -375,10 +480,10 @@ emit_restart (phuff_entropy_ptr entropy, int restart_num)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp, temp2;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+  register int temp, temp2, temp3;
   register int nbits;
   int blkn, ci;
   int Al = cinfo->Al;
@@ -403,31 +508,31 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     /* Compute the DC value after the required point transform by Al.
      * This is simply an arithmetic right shift.
      */
-    temp2 = IRIGHT_SHIFT((int) ((*block)[0]), Al);
+    temp2 = IRIGHT_SHIFT((int)((*block)[0]), Al);
 
     /* DC differences are figured on the point-transformed values. */
     temp = temp2 - entropy->last_dc_val[ci];
     entropy->last_dc_val[ci] = temp2;
 
     /* Encode the DC coefficient difference per section G.1.2.1 */
-    temp2 = temp;
-    if (temp < 0) {
-      temp = -temp;             /* temp is abs value of input */
-      /* For a negative input, want temp2 = bitwise complement of abs(input) */
-      /* This code assumes we are on a two's complement machine */
-      temp2--;
-    }
+
+    /* This is a well-known technique for obtaining the absolute value without
+     * a branch.  It is derived from an assembly language technique presented
+     * in "How to Optimize for the Pentium Processors", Copyright (c) 1996,
+     * 1997 by Agner Fog.
+     */
+    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
+    temp ^= temp3;
+    temp -= temp3;              /* temp is abs value of input */
+    /* For a negative input, want temp2 = bitwise complement of abs(input) */
+    temp2 = temp ^ temp3;
 
     /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 0;
-    while (temp) {
-      nbits++;
-      temp >>= 1;
-    }
+    nbits = JPEG_NBITS(temp);
     /* Check for out-of-range coefficient values.
      * Since we're encoding a difference, the range limit is twice as much.
      */
-    if (nbits > MAX_COEF_BITS+1)
+    if (nbits > MAX_COEF_BITS + 1)
       ERREXIT(cinfo, JERR_BAD_DCT_COEF);
 
     /* Count/emit the Huffman-coded symbol for the number of bits */
@@ -436,7 +541,7 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     /* Emit that number of bits of the value, if positive, */
     /* or the complement of its magnitude, if negative. */
     if (nbits)                  /* emit_bits rejects calls with size 0 */
-      emit_bits(entropy, (unsigned int) temp2, nbits);
+      emit_bits(entropy, (unsigned int)temp2, nbits);
   }
 
   cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -456,21 +561,116 @@ encode_mcu_DC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 }
 
 
+/*
+ * Data preparation for encode_mcu_AC_first().
+ */
+
+#define COMPUTE_ABSVALUES_AC_FIRST(Sl) { \
+  for (k = 0; k < Sl; k++) { \
+    temp = block[jpeg_natural_order_start[k]]; \
+    if (temp == 0) \
+      continue; \
+    /* We must apply the point transform by Al.  For AC coefficients this \
+     * is an integer division with rounding towards 0.  To do this portably \
+     * in C, we shift after obtaining the absolute value; so the code is \
+     * interwoven with finding the abs value (temp) and output bits (temp2). \
+     */ \
+    temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+    temp ^= temp2; \
+    temp -= temp2;              /* temp is abs value of input */ \
+    temp >>= Al;                /* apply the point transform */ \
+    /* Watch out for case that nonzero coef is zero after point transform */ \
+    if (temp == 0) \
+      continue; \
+    /* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
+    temp2 ^= temp; \
+    values[k] = (JCOEF)temp; \
+    values[k + DCTSIZE2] = (JCOEF)temp2; \
+    zerobits |= ((size_t)1U) << k; \
+  } \
+}
+
+METHODDEF(void)
+encode_mcu_AC_first_prepare(const JCOEF *block,
+                            const int *jpeg_natural_order_start, int Sl,
+                            int Al, JCOEF *values, size_t *bits)
+{
+  register int k, temp, temp2;
+  size_t zerobits = 0U;
+  int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+  if (Sl0 > 32)
+    Sl0 = 32;
+#endif
+
+  COMPUTE_ABSVALUES_AC_FIRST(Sl0);
+
+  bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 4
+  zerobits = 0U;
+
+  if (Sl > 32) {
+    Sl -= 32;
+    jpeg_natural_order_start += 32;
+    values += 32;
+
+    COMPUTE_ABSVALUES_AC_FIRST(Sl);
+  }
+  bits[1] = zerobits;
+#endif
+}
+
 /*
  * MCU encoding for AC initial scan (either spectral selection,
  * or first pass of successive approximation).
  */
 
+#define ENCODE_COEFS_AC_FIRST(label) { \
+  while (zerobits) { \
+    r = count_zeroes(&zerobits); \
+    cvalue += r; \
+label \
+    temp  = cvalue[0]; \
+    temp2 = cvalue[DCTSIZE2]; \
+    \
+    /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+    while (r > 15) { \
+      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+      r -= 16; \
+    } \
+    \
+    /* Find the number of bits needed for the magnitude of the coefficient */ \
+    nbits = JPEG_NBITS_NONZERO(temp);  /* there must be at least one 1 bit */ \
+    /* Check for out-of-range coefficient values */ \
+    if (nbits > MAX_COEF_BITS) \
+      ERREXIT(cinfo, JERR_BAD_DCT_COEF); \
+    \
+    /* Count/emit Huffman symbol for run length / number of bits */ \
+    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits); \
+    \
+    /* Emit that number of bits of the value, if positive, */ \
+    /* or the complement of its magnitude, if negative. */ \
+    emit_bits(entropy, (unsigned int)temp2, nbits); \
+    \
+    cvalue++; \
+    zerobits >>= 1; \
+  } \
+}
+
 METHODDEF(boolean)
-encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   register int temp, temp2;
-  register int nbits;
-  register int r, k;
-  int Se = cinfo->Se;
+  register int nbits, r;
+  int Sl = cinfo->Se - cinfo->Ss + 1;
   int Al = cinfo->Al;
-  JBLOCKROW block;
+  JCOEF values_unaligned[2 * DCTSIZE2 + 15];
+  JCOEF *values;
+  const JCOEF *cvalue;
+  size_t zerobits;
+  size_t bits[8 / SIZEOF_SIZE_T];
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
   entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -480,66 +680,48 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->restarts_to_go == 0)
       emit_restart(entropy, entropy->next_restart_num);
 
-  /* Encode the MCU data block */
-  block = MCU_data[0];
+#ifdef WITH_SIMD
+  cvalue = values = (JCOEF *)PAD((JUINTPTR)values_unaligned, 16);
+#else
+  /* Not using SIMD, so alignment is not needed */
+  cvalue = values = values_unaligned;
+#endif
+
+  /* Prepare data */
+  entropy->AC_first_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+                            Sl, Al, values, bits);
+
+  zerobits = bits[0];
+#if SIZEOF_SIZE_T == 4
+  zerobits |= bits[1];
+#endif
+
+  /* Emit any pending EOBRUN */
+  if (zerobits && (entropy->EOBRUN > 0))
+    emit_eobrun(entropy);
+
+#if SIZEOF_SIZE_T == 4
+  zerobits = bits[0];
+#endif
 
   /* Encode the AC coefficients per section G.1.2.2, fig. G.3 */
 
-  r = 0;                        /* r = run length of zeros */
+  ENCODE_COEFS_AC_FIRST((void)0;);
 
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = (*block)[jpeg_natural_order[k]]) == 0) {
-      r++;
-      continue;
-    }
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value; so the code is
-     * interwoven with finding the abs value (temp) and output bits (temp2).
-     */
-    if (temp < 0) {
-      temp = -temp;             /* temp is abs value of input */
-      temp >>= Al;              /* apply the point transform */
-      /* For a negative coef, want temp2 = bitwise complement of abs(coef) */
-      temp2 = ~temp;
-    } else {
-      temp >>= Al;              /* apply the point transform */
-      temp2 = temp;
-    }
-    /* Watch out for case that nonzero coef is zero after point transform */
-    if (temp == 0) {
-      r++;
-      continue;
-    }
-
-    /* Emit any pending EOBRUN */
-    if (entropy->EOBRUN > 0)
-      emit_eobrun(entropy);
-    /* if run length > 15, must emit special run-length-16 codes (0xF0) */
-    while (r > 15) {
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-    }
-
-    /* Find the number of bits needed for the magnitude of the coefficient */
-    nbits = 1;                  /* there must be at least one 1 bit */
-    while ((temp >>= 1))
-      nbits++;
-    /* Check for out-of-range coefficient values */
-    if (nbits > MAX_COEF_BITS)
-      ERREXIT(cinfo, JERR_BAD_DCT_COEF);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + nbits);
-
-    /* Emit that number of bits of the value, if positive, */
-    /* or the complement of its magnitude, if negative. */
-    emit_bits(entropy, (unsigned int) temp2, nbits);
-
-    r = 0;                      /* reset zero run length */
+#if SIZEOF_SIZE_T == 4
+  zerobits = bits[1];
+  if (zerobits) {
+    int diff = ((values + DCTSIZE2 / 2) - cvalue);
+    r = count_zeroes(&zerobits);
+    r += diff;
+    cvalue += r;
+    goto first_iter_ac_first;
   }
 
-  if (r > 0) {                  /* If there are trailing zeroes, */
+  ENCODE_COEFS_AC_FIRST(first_iter_ac_first:);
+#endif
+
+  if (cvalue < (values + Sl)) { /* If there are trailing zeroes, */
     entropy->EOBRUN++;          /* count an EOB */
     if (entropy->EOBRUN == 0x7FFF)
       emit_eobrun(entropy);     /* force it out to avoid overflow */
@@ -569,9 +751,9 @@ encode_mcu_AC_first (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_DC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   register int temp;
   int blkn;
   int Al = cinfo->Al;
@@ -591,7 +773,7 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
     /* We simply emit the Al'th bit of the DC coefficient value. */
     temp = (*block)[0];
-    emit_bits(entropy, (unsigned int) (temp >> Al), 1);
+    emit_bits(entropy, (unsigned int)(temp >> Al), 1);
   }
 
   cinfo->dest->next_output_byte = entropy->next_output_byte;
@@ -611,23 +793,149 @@ encode_mcu_DC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 }
 
 
+/*
+ * Data preparation for encode_mcu_AC_refine().
+ */
+
+#define COMPUTE_ABSVALUES_AC_REFINE(Sl, koffset) { \
+  /* It is convenient to make a pre-pass to determine the transformed \
+   * coefficients' absolute values and the EOB position. \
+   */ \
+  for (k = 0; k < Sl; k++) { \
+    temp = block[jpeg_natural_order_start[k]]; \
+    /* We must apply the point transform by Al.  For AC coefficients this \
+     * is an integer division with rounding towards 0.  To do this portably \
+     * in C, we shift after obtaining the absolute value. \
+     */ \
+    temp2 = temp >> (CHAR_BIT * sizeof(int) - 1); \
+    temp ^= temp2; \
+    temp -= temp2;              /* temp is abs value of input */ \
+    temp >>= Al;                /* apply the point transform */ \
+    if (temp != 0) { \
+      zerobits |= ((size_t)1U) << k; \
+      signbits |= ((size_t)(temp2 + 1)) << k; \
+    } \
+    absvalues[k] = (JCOEF)temp; /* save abs value for main pass */ \
+    if (temp == 1) \
+      EOB = k + koffset;        /* EOB = index of last newly-nonzero coef */ \
+  } \
+}
+
+METHODDEF(int)
+encode_mcu_AC_refine_prepare(const JCOEF *block,
+                             const int *jpeg_natural_order_start, int Sl,
+                             int Al, JCOEF *absvalues, size_t *bits)
+{
+  register int k, temp, temp2;
+  int EOB = 0;
+  size_t zerobits = 0U, signbits = 0U;
+  int Sl0 = Sl;
+
+#if SIZEOF_SIZE_T == 4
+  if (Sl0 > 32)
+    Sl0 = 32;
+#endif
+
+  COMPUTE_ABSVALUES_AC_REFINE(Sl0, 0);
+
+  bits[0] = zerobits;
+#if SIZEOF_SIZE_T == 8
+  bits[1] = signbits;
+#else
+  bits[2] = signbits;
+
+  zerobits = 0U;
+  signbits = 0U;
+
+  if (Sl > 32) {
+    Sl -= 32;
+    jpeg_natural_order_start += 32;
+    absvalues += 32;
+
+    COMPUTE_ABSVALUES_AC_REFINE(Sl, 32);
+  }
+
+  bits[1] = zerobits;
+  bits[3] = signbits;
+#endif
+
+  return EOB;
+}
+
+
 /*
  * MCU encoding for AC successive approximation refinement scan.
  */
 
+#define ENCODE_COEFS_AC_REFINE(label) { \
+  while (zerobits) { \
+    idx = count_zeroes(&zerobits); \
+    r += idx; \
+    cabsvalue += idx; \
+    signbits >>= idx; \
+label \
+    /* Emit any required ZRLs, but not if they can be folded into EOB */ \
+    while (r > 15 && (cabsvalue <= EOBPTR)) { \
+      /* emit any pending EOBRUN and the BE correction bits */ \
+      emit_eobrun(entropy); \
+      /* Emit ZRL */ \
+      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0); \
+      r -= 16; \
+      /* Emit buffered correction bits that must be associated with ZRL */ \
+      emit_buffered_bits(entropy, BR_buffer, BR); \
+      BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+      BR = 0; \
+    } \
+    \
+    temp = *cabsvalue++; \
+    \
+    /* If the coef was previously nonzero, it only needs a correction bit. \
+     * NOTE: a straight translation of the spec's figure G.7 would suggest \
+     * that we also need to test r > 15.  But if r > 15, we can only get here \
+     * if k > EOB, which implies that this coefficient is not 1. \
+     */ \
+    if (temp > 1) { \
+      /* The correction bit is the next bit of the absolute value. */ \
+      BR_buffer[BR++] = (char)(temp & 1); \
+      signbits >>= 1; \
+      zerobits >>= 1; \
+      continue; \
+    } \
+    \
+    /* Emit any pending EOBRUN and the BE correction bits */ \
+    emit_eobrun(entropy); \
+    \
+    /* Count/emit Huffman symbol for run length / number of bits */ \
+    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1); \
+    \
+    /* Emit output bit for newly-nonzero coef */ \
+    temp = signbits & 1; /* ((*block)[jpeg_natural_order_start[k]] < 0) ? 0 : 1 */ \
+    emit_bits(entropy, (unsigned int)temp, 1); \
+    \
+    /* Emit buffered correction bits that must be associated with this code */ \
+    emit_buffered_bits(entropy, BR_buffer, BR); \
+    BR_buffer = entropy->bit_buffer; /* BE bits are gone now */ \
+    BR = 0; \
+    r = 0;                      /* reset zero run length */ \
+    signbits >>= 1; \
+    zerobits >>= 1; \
+  } \
+}
+
 METHODDEF(boolean)
-encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
+encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
-  register int temp;
-  register int r, k;
-  int EOB;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
+  register int temp, r, idx;
   char *BR_buffer;
   unsigned int BR;
-  int Se = cinfo->Se;
+  int Sl = cinfo->Se - cinfo->Ss + 1;
   int Al = cinfo->Al;
-  JBLOCKROW block;
-  int absvalues[DCTSIZE2];
+  JCOEF absvalues_unaligned[DCTSIZE2 + 15];
+  JCOEF *absvalues;
+  const JCOEF *cabsvalue, *EOBPTR;
+  size_t zerobits, signbits;
+  size_t bits[16 / SIZEOF_SIZE_T];
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
   entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -637,26 +945,17 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->restarts_to_go == 0)
       emit_restart(entropy, entropy->next_restart_num);
 
-  /* Encode the MCU data block */
-  block = MCU_data[0];
+#ifdef WITH_SIMD
+  cabsvalue = absvalues = (JCOEF *)PAD((JUINTPTR)absvalues_unaligned, 16);
+#else
+  /* Not using SIMD, so alignment is not needed */
+  cabsvalue = absvalues = absvalues_unaligned;
+#endif
 
-  /* It is convenient to make a pre-pass to determine the transformed
-   * coefficients' absolute values and the EOB position.
-   */
-  EOB = 0;
-  for (k = cinfo->Ss; k <= Se; k++) {
-    temp = (*block)[jpeg_natural_order[k]];
-    /* We must apply the point transform by Al.  For AC coefficients this
-     * is an integer division with rounding towards 0.  To do this portably
-     * in C, we shift after obtaining the absolute value.
-     */
-    if (temp < 0)
-      temp = -temp;             /* temp is abs value of input */
-    temp >>= Al;                /* apply the point transform */
-    absvalues[k] = temp;        /* save abs value for main pass */
-    if (temp == 1)
-      EOB = k;                  /* EOB = index of last newly-nonzero coef */
-  }
+  /* Prepare data */
+  EOBPTR = absvalues +
+    entropy->AC_refine_prepare(MCU_data[0][0], jpeg_natural_order + cinfo->Ss,
+                               Sl, Al, absvalues, bits);
 
   /* Encode the AC coefficients per section G.1.2.3, fig. G.7 */
 
@@ -664,53 +963,33 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   BR = 0;                       /* BR = count of buffered bits added now */
   BR_buffer = entropy->bit_buffer + entropy->BE; /* Append bits to buffer */
 
-  for (k = cinfo->Ss; k <= Se; k++) {
-    if ((temp = absvalues[k]) == 0) {
-      r++;
-      continue;
-    }
+  zerobits = bits[0];
+#if SIZEOF_SIZE_T == 8
+  signbits = bits[1];
+#else
+  signbits = bits[2];
+#endif
+  ENCODE_COEFS_AC_REFINE((void)0;);
 
-    /* Emit any required ZRLs, but not if they can be folded into EOB */
-    while (r > 15 && k <= EOB) {
-      /* emit any pending EOBRUN and the BE correction bits */
-      emit_eobrun(entropy);
-      /* Emit ZRL */
-      emit_symbol(entropy, entropy->ac_tbl_no, 0xF0);
-      r -= 16;
-      /* Emit buffered correction bits that must be associated with ZRL */
-      emit_buffered_bits(entropy, BR_buffer, BR);
-      BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-      BR = 0;
-    }
+#if SIZEOF_SIZE_T == 4
+  zerobits = bits[1];
+  signbits = bits[3];
 
-    /* If the coef was previously nonzero, it only needs a correction bit.
-     * NOTE: a straight translation of the spec's figure G.7 would suggest
-     * that we also need to test r > 15.  But if r > 15, we can only get here
-     * if k > EOB, which implies that this coefficient is not 1.
-     */
-    if (temp > 1) {
-      /* The correction bit is the next bit of the absolute value. */
-      BR_buffer[BR++] = (char) (temp & 1);
-      continue;
-    }
-
-    /* Emit any pending EOBRUN and the BE correction bits */
-    emit_eobrun(entropy);
-
-    /* Count/emit Huffman symbol for run length / number of bits */
-    emit_symbol(entropy, entropy->ac_tbl_no, (r << 4) + 1);
-
-    /* Emit output bit for newly-nonzero coef */
-    temp = ((*block)[jpeg_natural_order[k]] < 0) ? 0 : 1;
-    emit_bits(entropy, (unsigned int) temp, 1);
-
-    /* Emit buffered correction bits that must be associated with this code */
-    emit_buffered_bits(entropy, BR_buffer, BR);
-    BR_buffer = entropy->bit_buffer; /* BE bits are gone now */
-    BR = 0;
-    r = 0;                      /* reset zero run length */
+  if (zerobits) {
+    int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
+    idx = count_zeroes(&zerobits);
+    signbits >>= idx;
+    idx += diff;
+    r += idx;
+    cabsvalue += idx;
+    goto first_iter_ac_refine;
   }
 
+  ENCODE_COEFS_AC_REFINE(first_iter_ac_refine:);
+#endif
+
+  r |= (int)((absvalues + Sl) - cabsvalue);
+
   if (r > 0 || BR > 0) {        /* If there are trailing zeroes, */
     entropy->EOBRUN++;          /* count an EOB */
     entropy->BE += BR;          /* concat my correction bits to older ones */
@@ -718,7 +997,8 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
      * 1. overflow of the EOB counter;
      * 2. overflow of the correction bit buffer during the next MCU.
      */
-    if (entropy->EOBRUN == 0x7FFF || entropy->BE > (MAX_CORR_BITS-DCTSIZE2+1))
+    if (entropy->EOBRUN == 0x7FFF ||
+        entropy->BE > (MAX_CORR_BITS - DCTSIZE2 + 1))
       emit_eobrun(entropy);
   }
 
@@ -744,9 +1024,9 @@ encode_mcu_AC_refine (j_compress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(void)
-finish_pass_phuff (j_compress_ptr cinfo)
+finish_pass_phuff(j_compress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
 
   entropy->next_output_byte = cinfo->dest->next_output_byte;
   entropy->free_in_buffer = cinfo->dest->free_in_buffer;
@@ -765,9 +1045,9 @@ finish_pass_phuff (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-finish_pass_gather_phuff (j_compress_ptr cinfo)
+finish_pass_gather_phuff(j_compress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   boolean is_DC_band;
   int ci, tbl;
   jpeg_component_info *compptr;
@@ -782,7 +1062,7 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
   /* It's important not to apply jpeg_gen_optimal_table more than once
    * per table, because it clobbers the input frequency counts!
    */
-  MEMZERO(did, sizeof(did));
+  memset(did, 0, sizeof(did));
 
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
@@ -793,13 +1073,13 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
     } else {
       tbl = compptr->ac_tbl_no;
     }
-    if (! did[tbl]) {
+    if (!did[tbl]) {
       if (is_DC_band)
-        htblptr = & cinfo->dc_huff_tbl_ptrs[tbl];
+        htblptr = &cinfo->dc_huff_tbl_ptrs[tbl];
       else
-        htblptr = & cinfo->ac_huff_tbl_ptrs[tbl];
+        htblptr = &cinfo->ac_huff_tbl_ptrs[tbl];
       if (*htblptr == NULL)
-        *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+        *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
       jpeg_gen_optimal_table(cinfo, *htblptr, entropy->count_ptrs[tbl]);
       did[tbl] = TRUE;
     }
@@ -812,15 +1092,15 @@ finish_pass_gather_phuff (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_phuff_encoder (j_compress_ptr cinfo)
+jinit_phuff_encoder(j_compress_ptr cinfo)
 {
   phuff_entropy_ptr entropy;
   int i;
 
   entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(phuff_entropy_encoder));
-  cinfo->entropy = (struct jpeg_entropy_encoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_encoder *)entropy;
   entropy->pub.start_pass = start_pass_phuff;
 
   /* Mark tables unallocated */
diff --git a/media/libjpeg/jcprepct.c b/media/libjpeg/jcprepct.c
index e72ebd87d2..f27cc34507 100644
--- a/media/libjpeg/jcprepct.c
+++ b/media/libjpeg/jcprepct.c
@@ -3,8 +3,8 @@
  *
  * This file is part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -78,9 +78,9 @@ typedef my_prep_controller *my_prep_ptr;
  */
 
 METHODDEF(void)
-start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_prep(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
 
   if (pass_mode != JBUF_PASS_THRU)
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -106,14 +106,14 @@ start_pass_prep (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 LOCAL(void)
-expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
-                    int input_rows, int output_rows)
+expand_bottom_edge(JSAMPARRAY image_data, JDIMENSION num_cols, int input_rows,
+                   int output_rows)
 {
   register int row;
 
   for (row = input_rows; row < output_rows; row++) {
-    jcopy_sample_rows(image_data, input_rows-1, image_data, row,
-                      1, num_cols);
+    jcopy_sample_rows(image_data, input_rows - 1, image_data, row, 1,
+                      num_cols);
   }
 }
 
@@ -128,13 +128,12 @@ expand_bottom_edge (JSAMPARRAY image_data, JDIMENSION num_cols,
  */
 
 METHODDEF(void)
-pre_process_data (j_compress_ptr cinfo,
-                  JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-                  JDIMENSION in_rows_avail,
-                  JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
-                  JDIMENSION out_row_groups_avail)
+pre_process_data(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                 JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+                 JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                 JDIMENSION out_row_groups_avail)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
   int numrows, ci;
   JDIMENSION inrows;
   jpeg_component_info *compptr;
@@ -144,10 +143,10 @@ pre_process_data (j_compress_ptr cinfo,
     /* Do color conversion to fill the conversion buffer. */
     inrows = in_rows_avail - *in_row_ctr;
     numrows = cinfo->max_v_samp_factor - prep->next_buf_row;
-    numrows = (int) MIN((JDIMENSION) numrows, inrows);
+    numrows = (int)MIN((JDIMENSION)numrows, inrows);
     (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
                                        prep->color_buf,
-                                       (JDIMENSION) prep->next_buf_row,
+                                       (JDIMENSION)prep->next_buf_row,
                                        numrows);
     *in_row_ctr += numrows;
     prep->next_buf_row += numrows;
@@ -164,7 +163,7 @@ pre_process_data (j_compress_ptr cinfo,
     /* If we've filled the conversion buffer, empty it. */
     if (prep->next_buf_row == cinfo->max_v_samp_factor) {
       (*cinfo->downsample->downsample) (cinfo,
-                                        prep->color_buf, (JDIMENSION) 0,
+                                        prep->color_buf, (JDIMENSION)0,
                                         output_buf, *out_row_group_ctr);
       prep->next_buf_row = 0;
       (*out_row_group_ctr)++;
@@ -172,14 +171,12 @@ pre_process_data (j_compress_ptr cinfo,
     /* If at bottom of image, pad the output to a full iMCU height.
      * Note we assume the caller is providing a one-iMCU-height output buffer!
      */
-    if (prep->rows_to_go == 0 &&
-        *out_row_group_ctr < out_row_groups_avail) {
+    if (prep->rows_to_go == 0 && *out_row_group_ctr < out_row_groups_avail) {
       for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
            ci++, compptr++) {
-        expand_bottom_edge(output_buf[ci],
-                           compptr->width_in_blocks * DCTSIZE,
-                           (int) (*out_row_group_ctr * compptr->v_samp_factor),
-                           (int) (out_row_groups_avail * compptr->v_samp_factor));
+        expand_bottom_edge(output_buf[ci], compptr->width_in_blocks * DCTSIZE,
+                           (int)(*out_row_group_ctr * compptr->v_samp_factor),
+                           (int)(out_row_groups_avail * compptr->v_samp_factor));
       }
       *out_row_group_ctr = out_row_groups_avail;
       break;                    /* can exit outer loop without test */
@@ -195,13 +192,12 @@ pre_process_data (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-pre_process_context (j_compress_ptr cinfo,
-                     JSAMPARRAY input_buf, JDIMENSION *in_row_ctr,
-                     JDIMENSION in_rows_avail,
-                     JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
-                     JDIMENSION out_row_groups_avail)
+pre_process_context(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                    JDIMENSION *in_row_ctr, JDIMENSION in_rows_avail,
+                    JSAMPIMAGE output_buf, JDIMENSION *out_row_group_ctr,
+                    JDIMENSION out_row_groups_avail)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
   int numrows, ci;
   int buf_height = cinfo->max_v_samp_factor * 3;
   JDIMENSION inrows;
@@ -211,19 +207,18 @@ pre_process_context (j_compress_ptr cinfo,
       /* Do color conversion to fill the conversion buffer. */
       inrows = in_rows_avail - *in_row_ctr;
       numrows = prep->next_buf_stop - prep->next_buf_row;
-      numrows = (int) MIN((JDIMENSION) numrows, inrows);
+      numrows = (int)MIN((JDIMENSION)numrows, inrows);
       (*cinfo->cconvert->color_convert) (cinfo, input_buf + *in_row_ctr,
                                          prep->color_buf,
-                                         (JDIMENSION) prep->next_buf_row,
+                                         (JDIMENSION)prep->next_buf_row,
                                          numrows);
       /* Pad at top of image, if first time through */
       if (prep->rows_to_go == cinfo->image_height) {
         for (ci = 0; ci < cinfo->num_components; ci++) {
           int row;
           for (row = 1; row <= cinfo->max_v_samp_factor; row++) {
-            jcopy_sample_rows(prep->color_buf[ci], 0,
-                              prep->color_buf[ci], -row,
-                              1, cinfo->image_width);
+            jcopy_sample_rows(prep->color_buf[ci], 0, prep->color_buf[ci],
+                              -row, 1, cinfo->image_width);
           }
         }
       }
@@ -245,9 +240,8 @@ pre_process_context (j_compress_ptr cinfo,
     }
     /* If we've gotten enough data, downsample a row group. */
     if (prep->next_buf_row == prep->next_buf_stop) {
-      (*cinfo->downsample->downsample) (cinfo,
-                                        prep->color_buf,
-                                        (JDIMENSION) prep->this_row_group,
+      (*cinfo->downsample->downsample) (cinfo, prep->color_buf,
+                                        (JDIMENSION)prep->this_row_group,
                                         output_buf, *out_row_group_ctr);
       (*out_row_group_ctr)++;
       /* Advance pointers with wraparound as necessary. */
@@ -267,9 +261,9 @@ pre_process_context (j_compress_ptr cinfo,
  */
 
 LOCAL(void)
-create_context_buffer (j_compress_ptr cinfo)
+create_context_buffer(j_compress_ptr cinfo)
 {
-  my_prep_ptr prep = (my_prep_ptr) cinfo->prep;
+  my_prep_ptr prep = (my_prep_ptr)cinfo->prep;
   int rgroup_height = cinfo->max_v_samp_factor;
   int ci, i;
   jpeg_component_info *compptr;
@@ -279,7 +273,7 @@ create_context_buffer (j_compress_ptr cinfo)
    * we need five row groups' worth of pointers for each component.
    */
   fake_buffer = (JSAMPARRAY)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (cinfo->num_components * 5 * rgroup_height) *
                                 sizeof(JSAMPROW));
 
@@ -290,13 +284,13 @@ create_context_buffer (j_compress_ptr cinfo)
      * horizontally within the buffer, if it so chooses.
      */
     true_buffer = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
-                      cinfo->max_h_samp_factor) / compptr->h_samp_factor),
-       (JDIMENSION) (3 * rgroup_height));
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+                     cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+       (JDIMENSION)(3 * rgroup_height));
     /* Copy true buffer row pointers into the middle of the fake row array */
-    MEMCOPY(fake_buffer + rgroup_height, true_buffer,
-            3 * rgroup_height * sizeof(JSAMPROW));
+    memcpy(fake_buffer + rgroup_height, true_buffer,
+           3 * rgroup_height * sizeof(JSAMPROW));
     /* Fill in the above and below wraparound pointers */
     for (i = 0; i < rgroup_height; i++) {
       fake_buffer[i] = true_buffer[2 * rgroup_height + i];
@@ -315,7 +309,7 @@ create_context_buffer (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
+jinit_c_prep_controller(j_compress_ptr cinfo, boolean need_full_buffer)
 {
   my_prep_ptr prep;
   int ci;
@@ -325,9 +319,9 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
 
   prep = (my_prep_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_prep_controller));
-  cinfo->prep = (struct jpeg_c_prep_controller *) prep;
+  cinfo->prep = (struct jpeg_c_prep_controller *)prep;
   prep->pub.start_pass = start_pass_prep;
 
   /* Allocate the color conversion buffer.
@@ -348,10 +342,10 @@ jinit_c_prep_controller (j_compress_ptr cinfo, boolean need_full_buffer)
     for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
          ci++, compptr++) {
       prep->color_buf[ci] = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
-         (JDIMENSION) (((long) compptr->width_in_blocks * DCTSIZE *
-                        cinfo->max_h_samp_factor) / compptr->h_samp_factor),
-         (JDIMENSION) cinfo->max_v_samp_factor);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
+         (JDIMENSION)(((long)compptr->width_in_blocks * DCTSIZE *
+                       cinfo->max_h_samp_factor) / compptr->h_samp_factor),
+         (JDIMENSION)cinfo->max_v_samp_factor);
     }
   }
 }
diff --git a/media/libjpeg/jcsample.c b/media/libjpeg/jcsample.c
index c4b4991487..e8515ebf0f 100644
--- a/media/libjpeg/jcsample.c
+++ b/media/libjpeg/jcsample.c
@@ -6,7 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -79,7 +79,7 @@ typedef my_downsampler *my_downsample_ptr;
  */
 
 METHODDEF(void)
-start_pass_downsample (j_compress_ptr cinfo)
+start_pass_downsample(j_compress_ptr cinfo)
 {
   /* no work for now */
 }
@@ -91,19 +91,19 @@ start_pass_downsample (j_compress_ptr cinfo)
  */
 
 LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
-                   JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
 {
   register JSAMPROW ptr;
   register JSAMPLE pixval;
   register int count;
   int row;
-  int numcols = (int) (output_cols - input_cols);
+  int numcols = (int)(output_cols - input_cols);
 
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
@@ -118,11 +118,11 @@ expand_right_edge (JSAMPARRAY image_data, int num_rows,
  */
 
 METHODDEF(void)
-sep_downsample (j_compress_ptr cinfo,
-                JSAMPIMAGE input_buf, JDIMENSION in_row_index,
-                JSAMPIMAGE output_buf, JDIMENSION out_row_group_index)
+sep_downsample(j_compress_ptr cinfo, JSAMPIMAGE input_buf,
+               JDIMENSION in_row_index, JSAMPIMAGE output_buf,
+               JDIMENSION out_row_group_index)
 {
-  my_downsample_ptr downsample = (my_downsample_ptr) cinfo->downsample;
+  my_downsample_ptr downsample = (my_downsample_ptr)cinfo->downsample;
   int ci;
   jpeg_component_info *compptr;
   JSAMPARRAY in_ptr, out_ptr;
@@ -144,8 +144,8 @@ sep_downsample (j_compress_ptr cinfo,
  */
 
 METHODDEF(void)
-int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                JSAMPARRAY input_data, JSAMPARRAY output_data)
+int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+               JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow, h_expand, v_expand, numpix, numpix2, h, v;
   JDIMENSION outcol, outcol_h;  /* outcol_h == outcol*h_expand */
@@ -156,14 +156,14 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
   h_expand = cinfo->max_h_samp_factor / compptr->h_samp_factor;
   v_expand = cinfo->max_v_samp_factor / compptr->v_samp_factor;
   numpix = h_expand * v_expand;
-  numpix2 = numpix/2;
+  numpix2 = numpix / 2;
 
   /* Expand input data enough to let all the output samples be generated
    * by the standard loop.  Special-casing padded output would be more
    * efficient.
    */
-  expand_right_edge(input_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, output_cols * h_expand);
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    output_cols * h_expand);
 
   inrow = 0;
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
@@ -172,12 +172,12 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
          outcol++, outcol_h += h_expand) {
       outvalue = 0;
       for (v = 0; v < v_expand; v++) {
-        inptr = input_data[inrow+v] + outcol_h;
+        inptr = input_data[inrow + v] + outcol_h;
         for (h = 0; h < h_expand; h++) {
-          outvalue += (JLONG) GETJSAMPLE(*inptr++);
+          outvalue += (JLONG)(*inptr++);
         }
       }
-      *outptr++ = (JSAMPLE) ((outvalue + numpix2) / numpix);
+      *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
     }
     inrow += v_expand;
   }
@@ -191,15 +191,15 @@ int_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   /* Copy the data */
-  jcopy_sample_rows(input_data, 0, output_data, 0,
-                    cinfo->max_v_samp_factor, cinfo->image_width);
+  jcopy_sample_rows(input_data, 0, output_data, 0, cinfo->max_v_samp_factor,
+                    cinfo->image_width);
   /* Edge-expand */
-  expand_right_edge(output_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, compptr->width_in_blocks * DCTSIZE);
+  expand_right_edge(output_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    compptr->width_in_blocks * DCTSIZE);
 }
 
 
@@ -216,8 +216,8 @@ fullsize_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                 JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION outcol;
@@ -229,16 +229,15 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
    * by the standard loop.  Special-casing padded output would be more
    * efficient.
    */
-  expand_right_edge(input_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, output_cols * 2);
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    output_cols * 2);
 
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr = input_data[outrow];
     bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1])
-                              + bias) >> 1);
+      *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
       bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
@@ -253,8 +252,8 @@ h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                 JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION outcol;
@@ -266,21 +265,20 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
    * by the standard loop.  Special-casing padded output would be more
    * efficient.
    */
-  expand_right_edge(input_data, cinfo->max_v_samp_factor,
-                    cinfo->image_width, output_cols * 2);
+  expand_right_edge(input_data, cinfo->max_v_samp_factor, cinfo->image_width,
+                    output_cols * 2);
 
   inrow = 0;
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow+1];
+    inptr1 = input_data[inrow + 1];
     bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ = (JSAMPLE) ((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                              GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1])
-                              + bias) >> 2);
+      *outptr++ =
+        (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
       bias ^= 3;                /* 1=>2, 2=>1 */
-      inptr0 += 2; inptr1 += 2;
+      inptr0 += 2;  inptr1 += 2;
     }
     inrow += 2;
   }
@@ -296,8 +294,8 @@ h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                        JSAMPARRAY input_data, JSAMPARRAY output_data)
+h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow;
   JDIMENSION colctr;
@@ -332,57 +330,45 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow+1];
-    above_ptr = input_data[inrow-1];
-    below_ptr = input_data[inrow+2];
+    inptr1 = input_data[inrow + 1];
+    above_ptr = input_data[inrow - 1];
+    below_ptr = input_data[inrow + 2];
 
     /* Special case for first column: pretend column -1 is same as column 0 */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
-               GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
-                GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+    neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
-    inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+    *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+    inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
       /* sum of pixels directly mapped to this output element */
-      membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                  GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+      membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
       /* sum of edge-neighbor pixels */
-      neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-                 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-                 GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
-                 GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+      neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+                 inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
       /* The edge-neighbors count twice as much as corner-neighbors */
       neighsum += neighsum;
       /* Add in the corner-neighbors */
-      neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
-                  GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+      neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
-      *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
-      inptr0 += 2; inptr1 += 2; above_ptr += 2; below_ptr += 2;
+      *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+      inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
-               GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
-                GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+    neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+    *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
     inrow += 2;
   }
@@ -396,8 +382,8 @@ h2v2_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                            JSAMPARRAY input_data, JSAMPARRAY output_data)
+fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                           JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int outrow;
   JDIMENSION colctr;
@@ -425,36 +411,33 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
   for (outrow = 0; outrow < compptr->v_samp_factor; outrow++) {
     outptr = output_data[outrow];
     inptr = input_data[outrow];
-    above_ptr = input_data[outrow-1];
-    below_ptr = input_data[outrow+1];
+    above_ptr = input_data[outrow - 1];
+    below_ptr = input_data[outrow + 1];
 
     /* Special case for first column */
-    colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
-             GETJSAMPLE(*inptr);
-    membersum = GETJSAMPLE(*inptr++);
-    nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                 GETJSAMPLE(*inptr);
+    colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+    membersum = *inptr++;
+    nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
-    lastcolsum = colsum; colsum = nextcolsum;
+    *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+    lastcolsum = colsum;  colsum = nextcolsum;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
-      membersum = GETJSAMPLE(*inptr++);
-      above_ptr++; below_ptr++;
-      nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                   GETJSAMPLE(*inptr);
+      membersum = *inptr++;
+      above_ptr++;  below_ptr++;
+      nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
-      *outptr++ = (JSAMPLE) ((membersum + 32768) >> 16);
-      lastcolsum = colsum; colsum = nextcolsum;
+      *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
+      lastcolsum = colsum;  colsum = nextcolsum;
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr);
+    membersum = *inptr;
     neighsum = lastcolsum + (colsum - membersum) + colsum;
     membersum = membersum * memberscale + neighsum * neighscale;
-    *outptr = (JSAMPLE) ((membersum + 32768) >> 16);
+    *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
   }
 }
@@ -468,7 +451,7 @@ fullsize_smooth_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jinit_downsampler (j_compress_ptr cinfo)
+jinit_downsampler(j_compress_ptr cinfo)
 {
   my_downsample_ptr downsample;
   int ci;
@@ -476,9 +459,9 @@ jinit_downsampler (j_compress_ptr cinfo)
   boolean smoothok = TRUE;
 
   downsample = (my_downsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_downsampler));
-  cinfo->downsample = (struct jpeg_downsampler *) downsample;
+  cinfo->downsample = (struct jpeg_downsampler *)downsample;
   downsample->pub.start_pass = start_pass_downsample;
   downsample->pub.downsample = sep_downsample;
   downsample->pub.need_context_rows = FALSE;
diff --git a/media/libjpeg/jctrans.c b/media/libjpeg/jctrans.c
index 6f16b052cf..e121028ec7 100644
--- a/media/libjpeg/jctrans.c
+++ b/media/libjpeg/jctrans.c
@@ -4,8 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1998, Thomas G. Lane.
  * Modified 2000-2009 by Guido Vollbeding.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -17,13 +17,14 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 /* Forward declarations */
-LOCAL(void) transencode_master_selection
-        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
-LOCAL(void) transencode_coef_controller
-        (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_master_selection(j_compress_ptr cinfo,
+                                         jvirt_barray_ptr *coef_arrays);
+LOCAL(void) transencode_coef_controller(j_compress_ptr cinfo,
+                                        jvirt_barray_ptr *coef_arrays);
 
 
 /*
@@ -39,14 +40,14 @@ LOCAL(void) transencode_coef_controller
  */
 
 GLOBAL(void)
-jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
+jpeg_write_coefficients(j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
 {
   if (cinfo->global_state != CSTATE_START)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   /* Mark all tables to be written */
   jpeg_suppress_tables(cinfo, FALSE);
   /* (Re)initialize error mgr and destination modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->dest->init_destination) (cinfo);
   /* Perform master selection of active modules */
   transencode_master_selection(cinfo, coef_arrays);
@@ -64,8 +65,7 @@ jpeg_write_coefficients (j_compress_ptr cinfo, jvirt_barray_ptr *coef_arrays)
  */
 
 GLOBAL(void)
-jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
-                               j_compress_ptr dstinfo)
+jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
 {
   JQUANT_TBL **qtblptr;
   jpeg_component_info *incomp, *outcomp;
@@ -97,12 +97,11 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
   /* Copy the source's quantization tables. */
   for (tblno = 0; tblno < NUM_QUANT_TBLS; tblno++) {
     if (srcinfo->quant_tbl_ptrs[tblno] != NULL) {
-      qtblptr = & dstinfo->quant_tbl_ptrs[tblno];
+      qtblptr = &dstinfo->quant_tbl_ptrs[tblno];
       if (*qtblptr == NULL)
-        *qtblptr = jpeg_alloc_quant_table((j_common_ptr) dstinfo);
-      MEMCOPY((*qtblptr)->quantval,
-              srcinfo->quant_tbl_ptrs[tblno]->quantval,
-              sizeof((*qtblptr)->quantval));
+        *qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
+      memcpy((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
+             sizeof((*qtblptr)->quantval));
       (*qtblptr)->sent_table = FALSE;
     }
   }
@@ -165,8 +164,8 @@ jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
  */
 
 LOCAL(void)
-transencode_master_selection (j_compress_ptr cinfo,
-                              jvirt_barray_ptr *coef_arrays)
+transencode_master_selection(j_compress_ptr cinfo,
+                             jvirt_barray_ptr *coef_arrays)
 {
   /* Although we don't actually use input_components for transcoding,
    * jcmaster.c's initial_setup will complain if input_components is 0.
@@ -199,7 +198,7 @@ transencode_master_selection (j_compress_ptr cinfo,
   jinit_marker_writer(cinfo);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Write the datastream header (SOI, JFIF) immediately.
    * Frame and scan headers are postponed till later.
@@ -238,10 +237,10 @@ typedef my_coef_controller *my_coef_ptr;
 
 
 LOCAL(void)
-start_iMCU_row (j_compress_ptr cinfo)
+start_iMCU_row(j_compress_ptr cinfo)
 /* Reset within-iMCU-row counters for a new row */
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* In an interleaved scan, an MCU row is the same as an iMCU row.
    * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -250,7 +249,7 @@ start_iMCU_row (j_compress_ptr cinfo)
   if (cinfo->comps_in_scan > 1) {
     coef->MCU_rows_per_iMCU_row = 1;
   } else {
-    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows-1))
+    if (coef->iMCU_row_num < (cinfo->total_iMCU_rows - 1))
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
     else
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
@@ -266,9 +265,9 @@ start_iMCU_row (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_coef(j_compress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   if (pass_mode != JBUF_CRANK_DEST)
     ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -289,9 +288,9 @@ start_pass_coef (j_compress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(boolean)
-compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
+compress_output(j_compress_ptr cinfo, JSAMPIMAGE input_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -306,9 +305,9 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     buffer[ci] = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+      ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
        coef->iMCU_row_num * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, FALSE);
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
   }
 
   /* Loop to process one whole iMCU row */
@@ -321,13 +320,13 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
         compptr = cinfo->cur_comp_info[ci];
         start_col = MCU_col_num * compptr->MCU_width;
-        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                : compptr->last_col_width;
+        blockcnt = (MCU_col_num < last_MCU_col) ? compptr->MCU_width :
+                                                  compptr->last_col_width;
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
           if (coef->iMCU_row_num < last_iMCU_row ||
-              yindex+yoffset < compptr->last_row_height) {
+              yindex + yoffset < compptr->last_row_height) {
             /* Fill in pointers to real blocks in this row */
-            buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+            buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
             for (xindex = 0; xindex < blockcnt; xindex++)
               MCU_buffer[blkn++] = buffer_ptr++;
           } else {
@@ -342,13 +341,13 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
            */
           for (; xindex < compptr->MCU_width; xindex++) {
             MCU_buffer[blkn] = coef->dummy_buffer[blkn];
-            MCU_buffer[blkn][0][0] = MCU_buffer[blkn-1][0][0];
+            MCU_buffer[blkn][0][0] = MCU_buffer[blkn - 1][0][0];
             blkn++;
           }
         }
       }
       /* Try to write the MCU. */
-      if (! (*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
+      if (!(*cinfo->entropy->encode_mcu) (cinfo, MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->mcu_ctr = MCU_col_num;
@@ -374,17 +373,17 @@ compress_output (j_compress_ptr cinfo, JSAMPIMAGE input_buf)
  */
 
 LOCAL(void)
-transencode_coef_controller (j_compress_ptr cinfo,
-                             jvirt_barray_ptr *coef_arrays)
+transencode_coef_controller(j_compress_ptr cinfo,
+                            jvirt_barray_ptr *coef_arrays)
 {
   my_coef_ptr coef;
   JBLOCKROW buffer;
   int i;
 
   coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
-  cinfo->coef = (struct jpeg_c_coef_controller *) coef;
+  cinfo->coef = (struct jpeg_c_coef_controller *)coef;
   coef->pub.start_pass = start_pass_coef;
   coef->pub.compress_data = compress_output;
 
@@ -393,9 +392,9 @@ transencode_coef_controller (j_compress_ptr cinfo,
 
   /* Allocate and pre-zero space for dummy DCT blocks. */
   buffer = (JBLOCKROW)
-    (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
-  jzero_far((void *) buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
+  jzero_far((void *)buffer, C_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
   for (i = 0; i < C_MAX_BLOCKS_IN_MCU; i++) {
     coef->dummy_buffer[i] = buffer + i;
   }
diff --git a/media/libjpeg/jdapimin.c b/media/libjpeg/jdapimin.c
index f80a14667f..f50c27edc3 100644
--- a/media/libjpeg/jdapimin.c
+++ b/media/libjpeg/jdapimin.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
+ * Copyright (C) 2016, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -23,6 +23,7 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdmaster.h"
+#include "jconfigint.h"
 
 
 /*
@@ -31,7 +32,7 @@
  */
 
 GLOBAL(void)
-jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
+jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
 {
   int i;
 
@@ -41,7 +42,7 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
     ERREXIT2(cinfo, JERR_BAD_LIB_VERSION, JPEG_LIB_VERSION, version);
   if (structsize != sizeof(struct jpeg_decompress_struct))
     ERREXIT2(cinfo, JERR_BAD_STRUCT_SIZE,
-             (int) sizeof(struct jpeg_decompress_struct), (int) structsize);
+             (int)sizeof(struct jpeg_decompress_struct), (int)structsize);
 
   /* For debugging purposes, we zero the whole master structure.
    * But the application has already set the err pointer, and may have set
@@ -50,16 +51,16 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
    * complain here.
    */
   {
-    struct jpeg_error_mgr * err = cinfo->err;
-    void * client_data = cinfo->client_data; /* ignore Purify complaint here */
-    MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
+    struct jpeg_error_mgr *err = cinfo->err;
+    void *client_data = cinfo->client_data; /* ignore Purify complaint here */
+    memset(cinfo, 0, sizeof(struct jpeg_decompress_struct));
     cinfo->err = err;
     cinfo->client_data = client_data;
   }
   cinfo->is_decompressor = TRUE;
 
   /* Initialize a memory manager instance for this object */
-  jinit_memory_mgr((j_common_ptr) cinfo);
+  jinit_memory_mgr((j_common_ptr)cinfo);
 
   /* Zero out pointers to permanent structures. */
   cinfo->progress = NULL;
@@ -89,9 +90,9 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
    * here.
    */
   cinfo->master = (struct jpeg_decomp_master *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
-                                  sizeof(my_decomp_master));
-  MEMZERO(cinfo->master, sizeof(my_decomp_master));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
+                                sizeof(my_decomp_master));
+  memset(cinfo->master, 0, sizeof(my_decomp_master));
 }
 
 
@@ -100,9 +101,9 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
  */
 
 GLOBAL(void)
-jpeg_destroy_decompress (j_decompress_ptr cinfo)
+jpeg_destroy_decompress(j_decompress_ptr cinfo)
 {
-  jpeg_destroy((j_common_ptr) cinfo); /* use common routine */
+  jpeg_destroy((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -112,9 +113,9 @@ jpeg_destroy_decompress (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_abort_decompress (j_decompress_ptr cinfo)
+jpeg_abort_decompress(j_decompress_ptr cinfo)
 {
-  jpeg_abort((j_common_ptr) cinfo); /* use common routine */
+  jpeg_abort((j_common_ptr)cinfo); /* use common routine */
 }
 
 
@@ -123,7 +124,7 @@ jpeg_abort_decompress (j_decompress_ptr cinfo)
  */
 
 LOCAL(void)
-default_decompress_parms (j_decompress_ptr cinfo)
+default_decompress_parms(j_decompress_ptr cinfo)
 {
   /* Guess the input colorspace, and set output colorspace accordingly. */
   /* (Wish JPEG committee had provided a real way to specify this...) */
@@ -250,7 +251,7 @@ default_decompress_parms (j_decompress_ptr cinfo)
  */
 
 GLOBAL(int)
-jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
+jpeg_read_header(j_decompress_ptr cinfo, boolean require_image)
 {
   int retcode;
 
@@ -271,7 +272,7 @@ jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
      * call jpeg_abort, but we can't change it now for compatibility reasons.
      * A side effect is to free any temporary memory (there shouldn't be any).
      */
-    jpeg_abort((j_common_ptr) cinfo); /* sets state = DSTATE_START */
+    jpeg_abort((j_common_ptr)cinfo); /* sets state = DSTATE_START */
     retcode = JPEG_HEADER_TABLES_ONLY;
     break;
   case JPEG_SUSPENDED:
@@ -296,7 +297,7 @@ jpeg_read_header (j_decompress_ptr cinfo, boolean require_image)
  */
 
 GLOBAL(int)
-jpeg_consume_input (j_decompress_ptr cinfo)
+jpeg_consume_input(j_decompress_ptr cinfo)
 {
   int retcode = JPEG_SUSPENDED;
 
@@ -308,7 +309,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
     /* Initialize application's data source module */
     (*cinfo->src->init_source) (cinfo);
     cinfo->global_state = DSTATE_INHEADER;
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case DSTATE_INHEADER:
     retcode = (*cinfo->inputctl->consume_input) (cinfo);
     if (retcode == JPEG_REACHED_SOS) { /* Found SOS, prepare to decompress */
@@ -343,7 +344,7 @@ jpeg_consume_input (j_decompress_ptr cinfo)
  */
 
 GLOBAL(boolean)
-jpeg_input_complete (j_decompress_ptr cinfo)
+jpeg_input_complete(j_decompress_ptr cinfo)
 {
   /* Check for valid jpeg object */
   if (cinfo->global_state < DSTATE_START ||
@@ -358,7 +359,7 @@ jpeg_input_complete (j_decompress_ptr cinfo)
  */
 
 GLOBAL(boolean)
-jpeg_has_multiple_scans (j_decompress_ptr cinfo)
+jpeg_has_multiple_scans(j_decompress_ptr cinfo)
 {
   /* Only valid after jpeg_read_header completes */
   if (cinfo->global_state < DSTATE_READY ||
@@ -378,10 +379,10 @@ jpeg_has_multiple_scans (j_decompress_ptr cinfo)
  */
 
 GLOBAL(boolean)
-jpeg_finish_decompress (j_decompress_ptr cinfo)
+jpeg_finish_decompress(j_decompress_ptr cinfo)
 {
   if ((cinfo->global_state == DSTATE_SCANNING ||
-       cinfo->global_state == DSTATE_RAW_OK) && ! cinfo->buffered_image) {
+       cinfo->global_state == DSTATE_RAW_OK) && !cinfo->buffered_image) {
     /* Terminate final pass of non-buffered mode */
     if (cinfo->output_scanline < cinfo->output_height)
       ERREXIT(cinfo, JERR_TOO_LITTLE_DATA);
@@ -395,13 +396,13 @@ jpeg_finish_decompress (j_decompress_ptr cinfo)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
   }
   /* Read until EOI */
-  while (! cinfo->inputctl->eoi_reached) {
+  while (!cinfo->inputctl->eoi_reached) {
     if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return FALSE;             /* Suspend, come back later */
   }
   /* Do final cleanup */
   (*cinfo->src->term_source) (cinfo);
   /* We can use jpeg_abort to release memory and reset global_state */
-  jpeg_abort((j_common_ptr) cinfo);
+  jpeg_abort((j_common_ptr)cinfo);
   return TRUE;
 }
diff --git a/media/libjpeg/jdapistd.c b/media/libjpeg/jdapistd.c
index 37afc8448b..8827d8abf5 100644
--- a/media/libjpeg/jdapistd.c
+++ b/media/libjpeg/jdapistd.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -21,11 +21,13 @@
 #include "jinclude.h"
 #include "jdmainct.h"
 #include "jdcoefct.h"
+#include "jdmaster.h"
+#include "jdmerge.h"
 #include "jdsample.h"
 #include "jmemsys.h"
 
 /* Forward declarations */
-LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
+LOCAL(boolean) output_pass_setup(j_decompress_ptr cinfo);
 
 
 /*
@@ -40,7 +42,7 @@ LOCAL(boolean) output_pass_setup (j_decompress_ptr cinfo);
  */
 
 GLOBAL(boolean)
-jpeg_start_decompress (j_decompress_ptr cinfo)
+jpeg_start_decompress(j_decompress_ptr cinfo)
 {
   if (cinfo->global_state == DSTATE_READY) {
     /* First call: initialize master control, select active modules */
@@ -60,7 +62,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
         int retcode;
         /* Call progress monitor hook if present */
         if (cinfo->progress != NULL)
-          (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+          (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
         /* Absorb some more input */
         retcode = (*cinfo->inputctl->consume_input) (cinfo);
         if (retcode == JPEG_SUSPENDED)
@@ -72,7 +74,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
             (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
           if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
             /* jdmaster underestimated number of scans; ratchet up one scan */
-            cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+            cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
           }
         }
       }
@@ -97,7 +99,7 @@ jpeg_start_decompress (j_decompress_ptr cinfo)
  */
 
 LOCAL(boolean)
-output_pass_setup (j_decompress_ptr cinfo)
+output_pass_setup(j_decompress_ptr cinfo)
 {
   if (cinfo->global_state != DSTATE_PRESCAN) {
     /* First call: do pass setup */
@@ -113,14 +115,14 @@ output_pass_setup (j_decompress_ptr cinfo)
       JDIMENSION last_scanline;
       /* Call progress monitor hook if present */
       if (cinfo->progress != NULL) {
-        cinfo->progress->pass_counter = (long) cinfo->output_scanline;
-        cinfo->progress->pass_limit = (long) cinfo->output_height;
-        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+        cinfo->progress->pass_limit = (long)cinfo->output_height;
+        (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
       }
       /* Process some data */
       last_scanline = cinfo->output_scanline;
-      (*cinfo->main->process_data) (cinfo, (JSAMPARRAY) NULL,
-                                    &cinfo->output_scanline, (JDIMENSION) 0);
+      (*cinfo->main->process_data) (cinfo, (JSAMPARRAY)NULL,
+                                    &cinfo->output_scanline, (JDIMENSION)0);
       if (cinfo->output_scanline == last_scanline)
         return FALSE;           /* No progress made, must suspend */
     }
@@ -150,13 +152,14 @@ output_pass_setup (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
-                    JDIMENSION *width)
+jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                   JDIMENSION *width)
 {
   int ci, align, orig_downsampled_width;
   JDIMENSION input_xoffset;
   boolean reinit_upsampler = FALSE;
   jpeg_component_info *compptr;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
@@ -190,7 +193,10 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
    * single-pass decompression case, allowing us to use the same MCU column
    * width for all of the components.
    */
-  align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
+  if (cinfo->comps_in_scan == 1 && cinfo->num_components == 1)
+    align = cinfo->_min_DCT_scaled_size;
+  else
+    align = cinfo->_min_DCT_scaled_size * cinfo->max_h_samp_factor;
 
   /* Adjust xoffset to the nearest iMCU boundary <= the requested value */
   input_xoffset = *xoffset;
@@ -203,24 +209,31 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
    */
   *width = *width + input_xoffset - *xoffset;
   cinfo->output_width = *width;
+  if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+    my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+    upsample->out_row_width =
+      cinfo->output_width * cinfo->out_color_components;
+  }
 
   /* Set the first and last iMCU columns that we must decompress.  These values
    * will be used in single-scan decompressions.
    */
-  cinfo->master->first_iMCU_col =
-    (JDIMENSION) (long) (*xoffset) / (long) align;
+  cinfo->master->first_iMCU_col = (JDIMENSION)(long)(*xoffset) / (long)align;
   cinfo->master->last_iMCU_col =
-    (JDIMENSION) jdiv_round_up((long) (*xoffset + cinfo->output_width),
-                               (long) align) - 1;
+    (JDIMENSION)jdiv_round_up((long)(*xoffset + cinfo->output_width),
+                              (long)align) - 1;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
+    int hsf = (cinfo->comps_in_scan == 1 && cinfo->num_components == 1) ?
+              1 : compptr->h_samp_factor;
+
     /* Set downsampled_width to the new output width. */
     orig_downsampled_width = compptr->downsampled_width;
     compptr->downsampled_width =
-      (JDIMENSION) jdiv_round_up((long) (cinfo->output_width *
-                                         compptr->h_samp_factor),
-                                 (long) cinfo->max_h_samp_factor);
+      (JDIMENSION)jdiv_round_up((long)(cinfo->output_width *
+                                       compptr->h_samp_factor),
+                                (long)cinfo->max_h_samp_factor);
     if (compptr->downsampled_width < 2 && orig_downsampled_width >= 2)
       reinit_upsampler = TRUE;
 
@@ -228,12 +241,10 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
      * values will be used in multi-scan decompressions.
      */
     cinfo->master->first_MCU_col[ci] =
-      (JDIMENSION) (long) (*xoffset * compptr->h_samp_factor) /
-                   (long) align;
+      (JDIMENSION)(long)(*xoffset * hsf) / (long)align;
     cinfo->master->last_MCU_col[ci] =
-      (JDIMENSION) jdiv_round_up((long) ((*xoffset + cinfo->output_width) *
-                                         compptr->h_samp_factor),
-                                 (long) align) - 1;
+      (JDIMENSION)jdiv_round_up((long)((*xoffset + cinfo->output_width) * hsf),
+                                (long)align) - 1;
   }
 
   if (reinit_upsampler) {
@@ -258,8 +269,8 @@ jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
  */
 
 GLOBAL(JDIMENSION)
-jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
-                     JDIMENSION max_lines)
+jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                    JDIMENSION max_lines)
 {
   JDIMENSION row_ctr;
 
@@ -272,9 +283,9 @@ jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->output_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->output_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->output_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Process some data */
@@ -287,8 +298,16 @@ jpeg_read_scanlines (j_decompress_ptr cinfo, JSAMPARRAY scanlines,
 
 /* Dummy color convert function used by jpeg_skip_scanlines() */
 LOCAL(void)
-noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-              JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+noop_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
+{
+}
+
+
+/* Dummy quantize function used by jpeg_skip_scanlines() */
+LOCAL(void)
+noop_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+              JSAMPARRAY output_buf, int num_rows)
 {
 }
 
@@ -302,20 +321,46 @@ noop_convert (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
  */
 
 LOCAL(void)
-read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
   JDIMENSION n;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
+  JSAMPLE dummy_sample[1] = { 0 };
+  JSAMPROW dummy_row = dummy_sample;
+  JSAMPARRAY scanlines = NULL;
   void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                          JDIMENSION input_row, JSAMPARRAY output_buf,
-                         int num_rows);
+                         int num_rows) = NULL;
+  void (*color_quantize) (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                          JSAMPARRAY output_buf, int num_rows) = NULL;
 
-  color_convert = cinfo->cconvert->color_convert;
-  cinfo->cconvert->color_convert = noop_convert;
+  if (cinfo->cconvert && cinfo->cconvert->color_convert) {
+    color_convert = cinfo->cconvert->color_convert;
+    cinfo->cconvert->color_convert = noop_convert;
+    /* This just prevents UBSan from complaining about adding 0 to a NULL
+     * pointer.  The pointer isn't actually used.
+     */
+    scanlines = &dummy_row;
+  }
+
+  if (cinfo->cquantize && cinfo->cquantize->color_quantize) {
+    color_quantize = cinfo->cquantize->color_quantize;
+    cinfo->cquantize->color_quantize = noop_quantize;
+  }
+
+  if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+    my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
+    scanlines = &upsample->spare_row;
+  }
 
   for (n = 0; n < num_lines; n++)
-    jpeg_read_scanlines(cinfo, NULL, 1);
+    jpeg_read_scanlines(cinfo, scanlines, 1);
 
-  cinfo->cconvert->color_convert = color_convert;
+  if (color_convert)
+    cinfo->cconvert->color_convert = color_convert;
+
+  if (color_quantize)
+    cinfo->cquantize->color_quantize = color_quantize;
 }
 
 
@@ -325,10 +370,16 @@ read_and_discard_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
  */
 
 LOCAL(void)
-increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
+increment_simple_rowgroup_ctr(j_decompress_ptr cinfo, JDIMENSION rows)
 {
   JDIMENSION rows_left;
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
+
+  if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
+    read_and_discard_scanlines(cinfo, rows);
+    return;
+  }
 
   /* Increment the counter to the next row group after the skipped rows. */
   main_ptr->rowgroup_ctr += rows / cinfo->max_v_samp_factor;
@@ -354,23 +405,31 @@ increment_simple_rowgroup_ctr (j_decompress_ptr cinfo, JDIMENSION rows)
  */
 
 GLOBAL(JDIMENSION)
-jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
+jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   JDIMENSION i, x;
   int y;
   JDIMENSION lines_per_iMCU_row, lines_left_in_iMCU_row, lines_after_iMCU_row;
   JDIMENSION lines_to_skip, lines_to_read;
 
+  /* Two-pass color quantization is not supported. */
+  if (cinfo->quantize_colors && cinfo->two_pass_quantize)
+    ERREXIT(cinfo, JERR_NOTIMPL);
+
   if (cinfo->global_state != DSTATE_SCANNING)
     ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
 
   /* Do not skip past the bottom of the image. */
   if (cinfo->output_scanline + num_lines >= cinfo->output_height) {
+    num_lines = cinfo->output_height - cinfo->output_scanline;
     cinfo->output_scanline = cinfo->output_height;
-    return cinfo->output_height - cinfo->output_scanline;
+    (*cinfo->inputctl->finish_input_pass) (cinfo);
+    cinfo->inputctl->eoi_reached = TRUE;
+    return num_lines;
   }
 
   if (num_lines == 0)
@@ -419,8 +478,10 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
     main_ptr->buffer_full = FALSE;
     main_ptr->rowgroup_ctr = 0;
     main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
-    upsample->next_row_out = cinfo->max_v_samp_factor;
-    upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+    if (!master->using_merged_upsample) {
+      upsample->next_row_out = cinfo->max_v_samp_factor;
+      upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+    }
   }
 
   /* Skipping is much simpler when context rows are not required. */
@@ -432,8 +493,10 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
       cinfo->output_scanline += lines_left_in_iMCU_row;
       main_ptr->buffer_full = FALSE;
       main_ptr->rowgroup_ctr = 0;
-      upsample->next_row_out = cinfo->max_v_samp_factor;
-      upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+      if (!master->using_merged_upsample) {
+        upsample->next_row_out = cinfo->max_v_samp_factor;
+        upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+      }
     }
   }
 
@@ -458,7 +521,7 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
     if (cinfo->upsample->need_context_rows) {
       cinfo->output_scanline += lines_to_skip;
       cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
-      main_ptr->iMCU_row_ctr += lines_after_iMCU_row / lines_per_iMCU_row;
+      main_ptr->iMCU_row_ctr += lines_to_skip / lines_per_iMCU_row;
       /* It is complex to properly move to the middle of a context block, so
        * read the remaining lines instead of skipping them.
        */
@@ -468,7 +531,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
       cinfo->output_iMCU_row += lines_to_skip / lines_per_iMCU_row;
       increment_simple_rowgroup_ctr(cinfo, lines_to_read);
     }
-    upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+    if (!master->using_merged_upsample)
+      upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
     return num_lines;
   }
 
@@ -480,6 +544,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
          * decoded coefficients.  This is ~5% faster for large subsets, but
          * it's tough to tell a difference for smaller images.
          */
+        if (!cinfo->entropy->insufficient_data)
+          cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
         (*cinfo->entropy->decode_mcu) (cinfo, NULL);
       }
     }
@@ -509,7 +575,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
    * bit odd, since "rows_to_go" seems to be redundantly keeping track of
    * output_scanline.
    */
-  upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
+  if (!master->using_merged_upsample)
+    upsample->rows_to_go = cinfo->output_height - cinfo->output_scanline;
 
   /* Always skip the requested number of lines. */
   return num_lines;
@@ -521,8 +588,8 @@ jpeg_skip_scanlines (j_decompress_ptr cinfo, JDIMENSION num_lines)
  */
 
 GLOBAL(JDIMENSION)
-jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
-                    JDIMENSION max_lines)
+jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                   JDIMENSION max_lines)
 {
   JDIMENSION lines_per_iMCU_row;
 
@@ -535,9 +602,9 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
 
   /* Call progress monitor hook if present */
   if (cinfo->progress != NULL) {
-    cinfo->progress->pass_counter = (long) cinfo->output_scanline;
-    cinfo->progress->pass_limit = (long) cinfo->output_height;
-    (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+    cinfo->progress->pass_counter = (long)cinfo->output_scanline;
+    cinfo->progress->pass_limit = (long)cinfo->output_height;
+    (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
   }
 
   /* Verify that at least one iMCU row can be returned. */
@@ -546,7 +613,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
 
   /* Decompress directly into user's buffer. */
-  if (! (*cinfo->coef->decompress_data) (cinfo, data))
+  if (!(*cinfo->coef->decompress_data) (cinfo, data))
     return 0;                   /* suspension forced, can do nothing more */
 
   /* OK, we processed one iMCU row. */
@@ -564,7 +631,7 @@ jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
  */
 
 GLOBAL(boolean)
-jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
+jpeg_start_output(j_decompress_ptr cinfo, int scan_number)
 {
   if (cinfo->global_state != DSTATE_BUFIMAGE &&
       cinfo->global_state != DSTATE_PRESCAN)
@@ -572,8 +639,7 @@ jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
   /* Limit scan number to valid range */
   if (scan_number <= 0)
     scan_number = 1;
-  if (cinfo->inputctl->eoi_reached &&
-      scan_number > cinfo->input_scan_number)
+  if (cinfo->inputctl->eoi_reached && scan_number > cinfo->input_scan_number)
     scan_number = cinfo->input_scan_number;
   cinfo->output_scan_number = scan_number;
   /* Perform any dummy output passes, and set up for the real pass */
@@ -589,7 +655,7 @@ jpeg_start_output (j_decompress_ptr cinfo, int scan_number)
  */
 
 GLOBAL(boolean)
-jpeg_finish_output (j_decompress_ptr cinfo)
+jpeg_finish_output(j_decompress_ptr cinfo)
 {
   if ((cinfo->global_state == DSTATE_SCANNING ||
        cinfo->global_state == DSTATE_RAW_OK) && cinfo->buffered_image) {
@@ -603,7 +669,7 @@ jpeg_finish_output (j_decompress_ptr cinfo)
   }
   /* Read markers looking for SOS or EOI */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
-         ! cinfo->inputctl->eoi_reached) {
+         !cinfo->inputctl->eoi_reached) {
     if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return FALSE;             /* Suspend, come back later */
   }
diff --git a/media/libjpeg/jdarith.c b/media/libjpeg/jdarith.c
index df3540eef5..21575e80c7 100644
--- a/media/libjpeg/jdarith.c
+++ b/media/libjpeg/jdarith.c
@@ -4,16 +4,19 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2020, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
- * This file contains portable arithmetic entropy decoding routines for JPEG
- * (implementing the ISO/IEC IS 10918-1 and CCITT Recommendation ITU-T T.81).
+ * This file contains portable arithmetic entropy encoding routines for JPEG
+ * (implementing Recommendation ITU-T T.81 | ISO/IEC 10918-1).
  *
  * Both sequential and progressive modes are supported in this single module.
  *
  * Suspension is not currently supported in this module.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
  */
 
 #define JPEG_INTERNALS
@@ -21,6 +24,9 @@
 #include "jpeglib.h"
 
 
+#define NEG_1  ((unsigned int)-1)
+
+
 /* Expanded entropy decoder object for arithmetic decoding. */
 
 typedef struct {
@@ -60,21 +66,21 @@ typedef arith_entropy_decoder *arith_entropy_ptr;
  * in the lower bits (mask 0x7F).
  */
 
-#define DC_STAT_BINS 64
-#define AC_STAT_BINS 256
+#define DC_STAT_BINS  64
+#define AC_STAT_BINS  256
 
 
 LOCAL(int)
-get_byte (j_decompress_ptr cinfo)
+get_byte(j_decompress_ptr cinfo)
 /* Read next input byte; we do not support suspension in this module. */
 {
   struct jpeg_source_mgr *src = cinfo->src;
 
   if (src->bytes_in_buffer == 0)
-    if (! (*src->fill_input_buffer) (cinfo))
+    if (!(*src->fill_input_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   src->bytes_in_buffer--;
-  return GETJOCTET(*src->next_input_byte++);
+  return *src->next_input_byte++;
 }
 
 
@@ -106,9 +112,9 @@ get_byte (j_decompress_ptr cinfo)
  */
 
 LOCAL(int)
-arith_decode (j_decompress_ptr cinfo, unsigned char *st)
+arith_decode(j_decompress_ptr cinfo, unsigned char *st)
 {
-  register arith_entropy_ptr e = (arith_entropy_ptr) cinfo->entropy;
+  register arith_entropy_ptr e = (arith_entropy_ptr)cinfo->entropy;
   register unsigned char nl, nm;
   register JLONG qe, temp;
   register int sv, data;
@@ -153,8 +159,8 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st)
    */
   sv = *st;
   qe = jpeg_aritab[sv & 0x7F];  /* => Qe_Value */
-  nl = qe & 0xFF; qe >>= 8;     /* Next_Index_LPS + Switch_MPS */
-  nm = qe & 0xFF; qe >>= 8;     /* Next_Index_MPS */
+  nl = qe & 0xFF;  qe >>= 8;    /* Next_Index_LPS + Switch_MPS */
+  nm = qe & 0xFF;  qe >>= 8;    /* Next_Index_MPS */
 
   /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
   temp = e->a - qe;
@@ -190,27 +196,27 @@ arith_decode (j_decompress_ptr cinfo, unsigned char *st)
  */
 
 LOCAL(void)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci;
   jpeg_component_info *compptr;
 
   /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
     ERREXIT(cinfo, JERR_CANT_SUSPEND);
 
   /* Re-initialize statistics areas */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
-      MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
+      memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
       /* Reset DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
     }
     if (!cinfo->progressive_mode || cinfo->Ss) {
-      MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
+      memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
     }
   }
 
@@ -241,9 +247,9 @@ process_restart (j_decompress_ptr cinfo)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int blkn, ci, tbl, sign;
@@ -277,7 +283,7 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       /* Figure F.21: Decoding nonzero value v */
       /* Figure F.22: Decoding the sign of v */
       sign = arith_decode(cinfo, st + 1);
-      st += 2; st += sign;
+      st += 2;  st += sign;
       /* Figure F.23: Decoding the magnitude category of v */
       if ((m = arith_decode(cinfo, st)) != 0) {
         st = entropy->dc_stats[tbl] + 20;       /* Table F.4: X1 = 20 */
@@ -291,9 +297,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
         }
       }
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;               /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
       else
         entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
@@ -302,12 +308,12 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       st += 14;
       while (m >>= 1)
         if (arith_decode(cinfo, st)) v |= m;
-      v += 1; if (sign) v = -v;
-      entropy->last_dc_val[ci] += v;
+      v += 1;  if (sign) v = -v;
+      entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
     }
 
     /* Scale and output the DC coefficient (assumes jpeg_natural_order[0]=0) */
-    (*block)[0] = (JCOEF) LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
+    (*block)[0] = (JCOEF)LEFT_SHIFT(entropy->last_dc_val[ci], cinfo->Al);
   }
 
   return TRUE;
@@ -320,9 +326,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   unsigned char *st;
   int tbl, sign, k;
@@ -348,7 +354,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     st = entropy->ac_stats[tbl] + 3 * (k - 1);
     if (arith_decode(cinfo, st)) break;         /* EOB flag */
     while (arith_decode(cinfo, st + 1) == 0) {
-      st += 3; k++;
+      st += 3;  k++;
       if (k > cinfo->Se) {
         WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
         entropy->ct = -1;                       /* spectral overflow */
@@ -380,9 +386,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     st += 14;
     while (m >>= 1)
       if (arith_decode(cinfo, st)) v |= m;
-    v += 1; if (sign) v = -v;
+    v += 1;  if (sign) v = -v;
     /* Scale and output coefficient in natural (dezigzagged) order */
-    (*block)[jpeg_natural_order[k]] = (JCOEF) ((unsigned)v << cinfo->Al);
+    (*block)[jpeg_natural_order[k]] = (JCOEF)((unsigned)v << cinfo->Al);
   }
 
   return TRUE;
@@ -394,9 +400,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   unsigned char *st;
   int p1, blkn;
 
@@ -427,9 +433,9 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   JBLOCKROW block;
   JCOEFPTR thiscoef;
   unsigned char *st;
@@ -450,7 +456,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   tbl = cinfo->cur_comp_info[0]->ac_tbl_no;
 
   p1 = 1 << cinfo->Al;          /* 1 in the bit position being coded */
-  m1 = (-1) << cinfo->Al;       /* -1 in the bit position being coded */
+  m1 = (NEG_1) << cinfo->Al;    /* -1 in the bit position being coded */
 
   /* Establish EOBx (previous stage end-of-block) index */
   for (kex = cinfo->Se; kex > 0; kex--)
@@ -465,20 +471,20 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       if (*thiscoef) {                          /* previously nonzero coef */
         if (arith_decode(cinfo, st + 2)) {
           if (*thiscoef < 0)
-            *thiscoef += m1;
+            *thiscoef += (JCOEF)m1;
           else
-            *thiscoef += p1;
+            *thiscoef += (JCOEF)p1;
         }
         break;
       }
       if (arith_decode(cinfo, st + 1)) {        /* newly nonzero coef */
         if (arith_decode(cinfo, entropy->fixed_bin))
-          *thiscoef = m1;
+          *thiscoef = (JCOEF)m1;
         else
-          *thiscoef = p1;
+          *thiscoef = (JCOEF)p1;
         break;
       }
-      st += 3; k++;
+      st += 3;  k++;
       if (k > cinfo->Se) {
         WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
         entropy->ct = -1;                       /* spectral overflow */
@@ -496,9 +502,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   jpeg_component_info *compptr;
   JBLOCKROW block;
   unsigned char *st;
@@ -535,7 +541,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       /* Figure F.21: Decoding nonzero value v */
       /* Figure F.22: Decoding the sign of v */
       sign = arith_decode(cinfo, st + 1);
-      st += 2; st += sign;
+      st += 2;  st += sign;
       /* Figure F.23: Decoding the magnitude category of v */
       if ((m = arith_decode(cinfo, st)) != 0) {
         st = entropy->dc_stats[tbl] + 20;       /* Table F.4: X1 = 20 */
@@ -549,9 +555,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
         }
       }
       /* Section F.1.4.4.1.2: Establish dc_context conditioning category */
-      if (m < (int) ((1L << cinfo->arith_dc_L[tbl]) >> 1))
+      if (m < (int)((1L << cinfo->arith_dc_L[tbl]) >> 1))
         entropy->dc_context[ci] = 0;               /* zero diff category */
-      else if (m > (int) ((1L << cinfo->arith_dc_U[tbl]) >> 1))
+      else if (m > (int)((1L << cinfo->arith_dc_U[tbl]) >> 1))
         entropy->dc_context[ci] = 12 + (sign * 4); /* large diff category */
       else
         entropy->dc_context[ci] = 4 + (sign * 4);  /* small diff category */
@@ -560,12 +566,12 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       st += 14;
       while (m >>= 1)
         if (arith_decode(cinfo, st)) v |= m;
-      v += 1; if (sign) v = -v;
-      entropy->last_dc_val[ci] += v;
+      v += 1;  if (sign) v = -v;
+      entropy->last_dc_val[ci] = (entropy->last_dc_val[ci] + v) & 0xffff;
     }
 
     if (block)
-      (*block)[0] = (JCOEF) entropy->last_dc_val[ci];
+      (*block)[0] = (JCOEF)entropy->last_dc_val[ci];
 
     /* Sections F.2.4.2 & F.1.4.4.2: Decoding of AC coefficients */
 
@@ -576,7 +582,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       st = entropy->ac_stats[tbl] + 3 * (k - 1);
       if (arith_decode(cinfo, st)) break;       /* EOB flag */
       while (arith_decode(cinfo, st + 1) == 0) {
-        st += 3; k++;
+        st += 3;  k++;
         if (k > DCTSIZE2 - 1) {
           WARNMS(cinfo, JWRN_ARITH_BAD_CODE);
           entropy->ct = -1;                     /* spectral overflow */
@@ -608,9 +614,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       st += 14;
       while (m >>= 1)
         if (arith_decode(cinfo, st)) v |= m;
-      v += 1; if (sign) v = -v;
+      v += 1;  if (sign) v = -v;
       if (block)
-        (*block)[jpeg_natural_order[k]] = (JCOEF) v;
+        (*block)[jpeg_natural_order[k]] = (JCOEF)v;
     }
   }
 
@@ -623,9 +629,9 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
 {
-  arith_entropy_ptr entropy = (arith_entropy_ptr) cinfo->entropy;
+  arith_entropy_ptr entropy = (arith_entropy_ptr)cinfo->entropy;
   int ci, tbl;
   jpeg_component_info *compptr;
 
@@ -644,11 +650,11 @@ start_pass (j_decompress_ptr cinfo)
     }
     if (cinfo->Ah != 0) {
       /* Successive approximation refinement scan: must have Al = Ah-1. */
-      if (cinfo->Ah-1 != cinfo->Al)
+      if (cinfo->Ah - 1 != cinfo->Al)
         goto bad;
     }
     if (cinfo->Al > 13) {       /* need not check for < 0 */
-      bad:
+bad:
       ERREXIT4(cinfo, JERR_BAD_PROGRESSION,
                cinfo->Ss, cinfo->Se, cinfo->Ah, cinfo->Al);
     }
@@ -658,9 +664,17 @@ start_pass (j_decompress_ptr cinfo)
      */
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
-      int *coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+      int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+      int *prev_coef_bit_ptr =
+        &cinfo->coef_bits[cindex + cinfo->num_components][0];
       if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+        if (cinfo->input_scan_number > 1)
+          prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+        else
+          prev_coef_bit_ptr[coefi] = 0;
+      }
       for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
         int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
         if (cinfo->Ah != expected)
@@ -684,8 +698,8 @@ start_pass (j_decompress_ptr cinfo)
     /* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
      * This ought to be an error condition, but we make it a warning.
      */
-    if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
-        (cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
+    if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
+        cinfo->Ah != 0 || cinfo->Al != 0)
       WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
     /* Select MCU decoding routine */
     entropy->pub.decode_mcu = decode_mcu;
@@ -699,9 +713,9 @@ start_pass (j_decompress_ptr cinfo)
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->dc_stats[tbl] == NULL)
-        entropy->dc_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, DC_STAT_BINS);
-      MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
+        entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
+      memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
       /* Initialize DC predictions to 0 */
       entropy->last_dc_val[ci] = 0;
       entropy->dc_context[ci] = 0;
@@ -711,9 +725,9 @@ start_pass (j_decompress_ptr cinfo)
       if (tbl < 0 || tbl >= NUM_ARITH_TBLS)
         ERREXIT1(cinfo, JERR_NO_ARITH_TABLE, tbl);
       if (entropy->ac_stats[tbl] == NULL)
-        entropy->ac_stats[tbl] = (unsigned char *) (*cinfo->mem->alloc_small)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, AC_STAT_BINS);
-      MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
+        entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
+      memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
     }
   }
 
@@ -721,6 +735,7 @@ start_pass (j_decompress_ptr cinfo)
   entropy->c = 0;
   entropy->a = 0;
   entropy->ct = -16;    /* force reading 2 initial bytes to fill C */
+  entropy->pub.insufficient_data = FALSE;
 
   /* Initialize restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -732,15 +747,15 @@ start_pass (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_arith_decoder (j_decompress_ptr cinfo)
+jinit_arith_decoder(j_decompress_ptr cinfo)
 {
   arith_entropy_ptr entropy;
   int i;
 
   entropy = (arith_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(arith_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
   entropy->pub.start_pass = start_pass;
 
   /* Mark tables unallocated */
@@ -756,9 +771,10 @@ jinit_arith_decoder (j_decompress_ptr cinfo)
     /* Create progression status table */
     int *coef_bit_ptr, ci;
     cinfo->coef_bits = (int (*)[DCTSIZE2])
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components*DCTSIZE2*sizeof(int));
-    coef_bit_ptr = & cinfo->coef_bits[0][0];
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  cinfo->num_components * 2 * DCTSIZE2 *
+                                  sizeof(int));
+    coef_bit_ptr = &cinfo->coef_bits[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
       for (i = 0; i < DCTSIZE2; i++)
         *coef_bit_ptr++ = -1;
diff --git a/media/libjpeg/jdatadst.c b/media/libjpeg/jdatadst.c
index dcaf6f0f96..6b4fed2339 100644
--- a/media/libjpeg/jdatadst.c
+++ b/media/libjpeg/jdatadst.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2012 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -23,11 +23,6 @@
 #include "jpeglib.h"
 #include "jerror.h"
 
-#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
-#endif
-
 
 /* Expanded data destination object for stdio output */
 
@@ -66,14 +61,14 @@ typedef my_mem_destination_mgr *my_mem_dest_ptr;
  */
 
 METHODDEF(void)
-init_destination (j_compress_ptr cinfo)
+init_destination(j_compress_ptr cinfo)
 {
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+  my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
 
   /* Allocate the output buffer --- it will be released when done with image */
   dest->buffer = (JOCTET *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  OUTPUT_BUF_SIZE * sizeof(JOCTET));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                OUTPUT_BUF_SIZE * sizeof(JOCTET));
 
   dest->pub.next_output_byte = dest->buffer;
   dest->pub.free_in_buffer = OUTPUT_BUF_SIZE;
@@ -81,7 +76,7 @@ init_destination (j_compress_ptr cinfo)
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
-init_mem_destination (j_compress_ptr cinfo)
+init_mem_destination(j_compress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -112,12 +107,12 @@ init_mem_destination (j_compress_ptr cinfo)
  */
 
 METHODDEF(boolean)
-empty_output_buffer (j_compress_ptr cinfo)
+empty_output_buffer(j_compress_ptr cinfo)
 {
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+  my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
 
-  if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) !=
-      (size_t) OUTPUT_BUF_SIZE)
+  if (fwrite(dest->buffer, 1, OUTPUT_BUF_SIZE, dest->outfile) !=
+      (size_t)OUTPUT_BUF_SIZE)
     ERREXIT(cinfo, JERR_FILE_WRITE);
 
   dest->pub.next_output_byte = dest->buffer;
@@ -128,23 +123,22 @@ empty_output_buffer (j_compress_ptr cinfo)
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(boolean)
-empty_mem_output_buffer (j_compress_ptr cinfo)
+empty_mem_output_buffer(j_compress_ptr cinfo)
 {
   size_t nextsize;
   JOCTET *nextbuffer;
-  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
 
   /* Try to allocate new buffer with double size */
   nextsize = dest->bufsize * 2;
-  nextbuffer = (JOCTET *) malloc(nextsize);
+  nextbuffer = (JOCTET *)malloc(nextsize);
 
   if (nextbuffer == NULL)
     ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
 
-  MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
+  memcpy(nextbuffer, dest->buffer, dest->bufsize);
 
-  if (dest->newbuffer != NULL)
-    free(dest->newbuffer);
+  free(dest->newbuffer);
 
   dest->newbuffer = nextbuffer;
 
@@ -169,14 +163,14 @@ empty_mem_output_buffer (j_compress_ptr cinfo)
  */
 
 METHODDEF(void)
-term_destination (j_compress_ptr cinfo)
+term_destination(j_compress_ptr cinfo)
 {
-  my_dest_ptr dest = (my_dest_ptr) cinfo->dest;
+  my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
   size_t datacount = OUTPUT_BUF_SIZE - dest->pub.free_in_buffer;
 
   /* Write any data remaining in the buffer */
   if (datacount > 0) {
-    if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
+    if (fwrite(dest->buffer, 1, datacount, dest->outfile) != datacount)
       ERREXIT(cinfo, JERR_FILE_WRITE);
   }
   fflush(dest->outfile);
@@ -187,9 +181,9 @@ term_destination (j_compress_ptr cinfo)
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
-term_mem_destination (j_compress_ptr cinfo)
+term_mem_destination(j_compress_ptr cinfo)
 {
-  my_mem_dest_ptr dest = (my_mem_dest_ptr) cinfo->dest;
+  my_mem_dest_ptr dest = (my_mem_dest_ptr)cinfo->dest;
 
   *dest->outbuffer = dest->buffer;
   *dest->outsize = (unsigned long)(dest->bufsize - dest->pub.free_in_buffer);
@@ -204,7 +198,7 @@ term_mem_destination (j_compress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
+jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile)
 {
   my_dest_ptr dest;
 
@@ -213,7 +207,7 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
    */
   if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(my_destination_mgr));
   } else if (cinfo->dest->init_destination != init_destination) {
     /* It is unsafe to reuse the existing destination manager unless it was
@@ -225,7 +219,7 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
-  dest = (my_dest_ptr) cinfo->dest;
+  dest = (my_dest_ptr)cinfo->dest;
   dest->pub.init_destination = init_destination;
   dest->pub.empty_output_buffer = empty_output_buffer;
   dest->pub.term_destination = term_destination;
@@ -249,8 +243,8 @@ jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile)
  */
 
 GLOBAL(void)
-jpeg_mem_dest (j_compress_ptr cinfo,
-               unsigned char **outbuffer, unsigned long *outsize)
+jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+              unsigned long *outsize)
 {
   my_mem_dest_ptr dest;
 
@@ -262,7 +256,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
    */
   if (cinfo->dest == NULL) {    /* first time for this JPEG object? */
     cinfo->dest = (struct jpeg_destination_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(my_mem_destination_mgr));
   } else if (cinfo->dest->init_destination != init_mem_destination) {
     /* It is unsafe to reuse the existing destination manager unless it was
@@ -271,7 +265,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
-  dest = (my_mem_dest_ptr) cinfo->dest;
+  dest = (my_mem_dest_ptr)cinfo->dest;
   dest->pub.init_destination = init_mem_destination;
   dest->pub.empty_output_buffer = empty_mem_output_buffer;
   dest->pub.term_destination = term_mem_destination;
@@ -281,7 +275,7 @@ jpeg_mem_dest (j_compress_ptr cinfo,
 
   if (*outbuffer == NULL || *outsize == 0) {
     /* Allocate initial buffer */
-    dest->newbuffer = *outbuffer = (unsigned char *) malloc(OUTPUT_BUF_SIZE);
+    dest->newbuffer = *outbuffer = (unsigned char *)malloc(OUTPUT_BUF_SIZE);
     if (dest->newbuffer == NULL)
       ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
     *outsize = OUTPUT_BUF_SIZE;
diff --git a/media/libjpeg/jdatasrc.c b/media/libjpeg/jdatasrc.c
index c83183fe19..e36a30d894 100644
--- a/media/libjpeg/jdatasrc.c
+++ b/media/libjpeg/jdatasrc.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * Modified 2009-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, 2016, D. R. Commander.
+ * Copyright (C) 2013, 2016, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -45,9 +45,9 @@ typedef my_source_mgr *my_src_ptr;
  */
 
 METHODDEF(void)
-init_source (j_decompress_ptr cinfo)
+init_source(j_decompress_ptr cinfo)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  my_src_ptr src = (my_src_ptr)cinfo->src;
 
   /* We reset the empty-input-file flag for each image,
    * but we don't clear the input buffer.
@@ -58,7 +58,7 @@ init_source (j_decompress_ptr cinfo)
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(void)
-init_mem_source (j_decompress_ptr cinfo)
+init_mem_source(j_decompress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -99,20 +99,20 @@ init_mem_source (j_decompress_ptr cinfo)
  */
 
 METHODDEF(boolean)
-fill_input_buffer (j_decompress_ptr cinfo)
+fill_input_buffer(j_decompress_ptr cinfo)
 {
-  my_src_ptr src = (my_src_ptr) cinfo->src;
+  my_src_ptr src = (my_src_ptr)cinfo->src;
   size_t nbytes;
 
-  nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
+  nbytes = fread(src->buffer, 1, INPUT_BUF_SIZE, src->infile);
 
   if (nbytes <= 0) {
     if (src->start_of_file)     /* Treat empty input file as fatal error */
       ERREXIT(cinfo, JERR_INPUT_EMPTY);
     WARNMS(cinfo, JWRN_JPEG_EOF);
     /* Insert a fake EOI marker */
-    src->buffer[0] = (JOCTET) 0xFF;
-    src->buffer[1] = (JOCTET) JPEG_EOI;
+    src->buffer[0] = (JOCTET)0xFF;
+    src->buffer[1] = (JOCTET)JPEG_EOI;
     nbytes = 2;
   }
 
@@ -125,10 +125,10 @@ fill_input_buffer (j_decompress_ptr cinfo)
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 METHODDEF(boolean)
-fill_mem_input_buffer (j_decompress_ptr cinfo)
+fill_mem_input_buffer(j_decompress_ptr cinfo)
 {
   static const JOCTET mybuffer[4] = {
-    (JOCTET) 0xFF, (JOCTET) JPEG_EOI, 0, 0
+    (JOCTET)0xFF, (JOCTET)JPEG_EOI, 0, 0
   };
 
   /* The whole JPEG data is expected to reside in the supplied memory
@@ -160,7 +160,7 @@ fill_mem_input_buffer (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+skip_input_data(j_decompress_ptr cinfo, long num_bytes)
 {
   struct jpeg_source_mgr *src = cinfo->src;
 
@@ -169,15 +169,15 @@ skip_input_data (j_decompress_ptr cinfo, long num_bytes)
    * any trouble anyway --- large skips are infrequent.
    */
   if (num_bytes > 0) {
-    while (num_bytes > (long) src->bytes_in_buffer) {
-      num_bytes -= (long) src->bytes_in_buffer;
-      (void) (*src->fill_input_buffer) (cinfo);
+    while (num_bytes > (long)src->bytes_in_buffer) {
+      num_bytes -= (long)src->bytes_in_buffer;
+      (void)(*src->fill_input_buffer) (cinfo);
       /* note we assume that fill_input_buffer will never return FALSE,
        * so suspension need not be handled.
        */
     }
-    src->next_input_byte += (size_t) num_bytes;
-    src->bytes_in_buffer -= (size_t) num_bytes;
+    src->next_input_byte += (size_t)num_bytes;
+    src->bytes_in_buffer -= (size_t)num_bytes;
   }
 }
 
@@ -201,7 +201,7 @@ skip_input_data (j_decompress_ptr cinfo, long num_bytes)
  */
 
 METHODDEF(void)
-term_source (j_decompress_ptr cinfo)
+term_source(j_decompress_ptr cinfo)
 {
   /* no work necessary here */
 }
@@ -214,7 +214,7 @@ term_source (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
+jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile)
 {
   my_src_ptr src;
 
@@ -225,11 +225,11 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
    */
   if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(my_source_mgr));
-    src = (my_src_ptr) cinfo->src;
+    src = (my_src_ptr)cinfo->src;
     src->buffer = (JOCTET *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   INPUT_BUF_SIZE * sizeof(JOCTET));
   } else if (cinfo->src->init_source != init_source) {
     /* It is unsafe to reuse the existing source manager unless it was created
@@ -241,7 +241,7 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
     ERREXIT(cinfo, JERR_BUFFER_SIZE);
   }
 
-  src = (my_src_ptr) cinfo->src;
+  src = (my_src_ptr)cinfo->src;
   src->pub.init_source = init_source;
   src->pub.fill_input_buffer = fill_input_buffer;
   src->pub.skip_input_data = skip_input_data;
@@ -260,8 +260,8 @@ jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile)
  */
 
 GLOBAL(void)
-jpeg_mem_src (j_decompress_ptr cinfo,
-              const unsigned char *inbuffer, unsigned long insize)
+jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+             unsigned long insize)
 {
   struct jpeg_source_mgr *src;
 
@@ -274,7 +274,7 @@ jpeg_mem_src (j_decompress_ptr cinfo,
    */
   if (cinfo->src == NULL) {     /* first time for this JPEG object? */
     cinfo->src = (struct jpeg_source_mgr *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                   sizeof(struct jpeg_source_mgr));
   } else if (cinfo->src->init_source != init_mem_source) {
     /* It is unsafe to reuse the existing source manager unless it was created
@@ -289,7 +289,7 @@ jpeg_mem_src (j_decompress_ptr cinfo,
   src->skip_input_data = skip_input_data;
   src->resync_to_restart = jpeg_resync_to_restart; /* use default method */
   src->term_source = term_source;
-  src->bytes_in_buffer = (size_t) insize;
-  src->next_input_byte = (const JOCTET *) inbuffer;
+  src->bytes_in_buffer = (size_t)insize;
+  src->next_input_byte = (const JOCTET *)inbuffer;
 }
 #endif
diff --git a/media/libjpeg/jdcoefct.c b/media/libjpeg/jdcoefct.c
index 1a48969b83..15e6cded62 100644
--- a/media/libjpeg/jdcoefct.c
+++ b/media/libjpeg/jdcoefct.c
@@ -5,8 +5,8 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
- * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2010, 2015-2016, 2019-2020, D. R. Commander.
+ * Copyright (C) 2015, 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -25,16 +25,15 @@
 
 
 /* Forward declarations */
-METHODDEF(int) decompress_onepass
-        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_onepass(j_decompress_ptr cinfo,
+                                  JSAMPIMAGE output_buf);
 #ifdef D_MULTISCAN_FILES_SUPPORTED
-METHODDEF(int) decompress_data
-        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+METHODDEF(int) decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
 #endif
 #ifdef BLOCK_SMOOTHING_SUPPORTED
-LOCAL(boolean) smoothing_ok (j_decompress_ptr cinfo);
-METHODDEF(int) decompress_smooth_data
-        (j_decompress_ptr cinfo, JSAMPIMAGE output_buf);
+LOCAL(boolean) smoothing_ok(j_decompress_ptr cinfo);
+METHODDEF(int) decompress_smooth_data(j_decompress_ptr cinfo,
+                                      JSAMPIMAGE output_buf);
 #endif
 
 
@@ -43,7 +42,7 @@ METHODDEF(int) decompress_smooth_data
  */
 
 METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
 {
   cinfo->input_iMCU_row = 0;
   start_iMCU_row(cinfo);
@@ -55,10 +54,10 @@ start_input_pass (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_output_pass (j_decompress_ptr cinfo)
+start_output_pass(j_decompress_ptr cinfo)
 {
 #ifdef BLOCK_SMOOTHING_SUPPORTED
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* If multipass, check to see whether to use block smoothing on this pass */
   if (coef->pub.coef_arrays != NULL) {
@@ -83,9 +82,9 @@ start_output_pass (j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   JDIMENSION last_MCU_col = cinfo->MCUs_per_row - 1;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
@@ -101,9 +100,11 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
          MCU_col_num++) {
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
-      jzero_far((void *) coef->MCU_buffer[0],
-                (size_t) (cinfo->blocks_in_MCU * sizeof(JBLOCK)));
-      if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+      jzero_far((void *)coef->MCU_buffer[0],
+                (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
+      if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->MCU_ctr = MCU_col_num;
@@ -120,28 +121,28 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
          * incremented past them!).  Note the inner loop relies on having
          * allocated the MCU_buffer[] blocks sequentially.
          */
-        blkn = 0;                 /* index of current DCT block within MCU */
+        blkn = 0;               /* index of current DCT block within MCU */
         for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
           compptr = cinfo->cur_comp_info[ci];
           /* Don't bother to IDCT an uninteresting component. */
-          if (! compptr->component_needed) {
+          if (!compptr->component_needed) {
             blkn += compptr->MCU_blocks;
             continue;
           }
           inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-          useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                      : compptr->last_col_width;
+          useful_width = (MCU_col_num < last_MCU_col) ?
+                         compptr->MCU_width : compptr->last_col_width;
           output_ptr = output_buf[compptr->component_index] +
-            yoffset * compptr->_DCT_scaled_size;
+                       yoffset * compptr->_DCT_scaled_size;
           start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
-              compptr->MCU_sample_width;
+                      compptr->MCU_sample_width;
           for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
             if (cinfo->input_iMCU_row < last_iMCU_row ||
-                yoffset+yindex < compptr->last_row_height) {
+                yoffset + yindex < compptr->last_row_height) {
               output_col = start_col;
               for (xindex = 0; xindex < useful_width; xindex++) {
                 (*inverse_DCT) (cinfo, compptr,
-                                (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+                                (JCOEFPTR)coef->MCU_buffer[blkn + xindex],
                                 output_ptr, output_col);
                 output_col += compptr->_DCT_scaled_size;
               }
@@ -172,7 +173,7 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
  */
 
 METHODDEF(int)
-dummy_consume_data (j_decompress_ptr cinfo)
+dummy_consume_data(j_decompress_ptr cinfo)
 {
   return JPEG_SUSPENDED;        /* Always indicate nothing was done */
 }
@@ -188,9 +189,9 @@ dummy_consume_data (j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-consume_data (j_decompress_ptr cinfo)
+consume_data(j_decompress_ptr cinfo)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION MCU_col_num;       /* index of current MCU within row */
   int blkn, ci, xindex, yindex, yoffset;
   JDIMENSION start_col;
@@ -202,9 +203,9 @@ consume_data (j_decompress_ptr cinfo)
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     compptr = cinfo->cur_comp_info[ci];
     buffer[ci] = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[compptr->component_index],
+      ((j_common_ptr)cinfo, coef->whole_image[compptr->component_index],
        cinfo->input_iMCU_row * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, TRUE);
+       (JDIMENSION)compptr->v_samp_factor, TRUE);
     /* Note: entropy decoder expects buffer to be zeroed,
      * but this is handled automatically by the memory manager
      * because we requested a pre-zeroed array.
@@ -222,14 +223,16 @@ consume_data (j_decompress_ptr cinfo)
         compptr = cinfo->cur_comp_info[ci];
         start_col = MCU_col_num * compptr->MCU_width;
         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-          buffer_ptr = buffer[ci][yindex+yoffset] + start_col;
+          buffer_ptr = buffer[ci][yindex + yoffset] + start_col;
           for (xindex = 0; xindex < compptr->MCU_width; xindex++) {
             coef->MCU_buffer[blkn++] = buffer_ptr++;
           }
         }
       }
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       /* Try to fetch the MCU. */
-      if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+      if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
         coef->MCU_ctr = MCU_col_num;
@@ -259,9 +262,9 @@ consume_data (j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   JDIMENSION block_num;
   int ci, block_row, block_rows;
@@ -276,7 +279,7 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   while (cinfo->input_scan_number < cinfo->output_scan_number ||
          (cinfo->input_scan_number == cinfo->output_scan_number &&
           cinfo->input_iMCU_row <= cinfo->output_iMCU_row)) {
-    if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+    if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return JPEG_SUSPENDED;
   }
 
@@ -284,19 +287,19 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Don't bother to IDCT an uninteresting component. */
-    if (! compptr->component_needed)
+    if (!compptr->component_needed)
       continue;
     /* Align the virtual buffer for this component. */
     buffer = (*cinfo->mem->access_virt_barray)
-      ((j_common_ptr) cinfo, coef->whole_image[ci],
+      ((j_common_ptr)cinfo, coef->whole_image[ci],
        cinfo->output_iMCU_row * compptr->v_samp_factor,
-       (JDIMENSION) compptr->v_samp_factor, FALSE);
+       (JDIMENSION)compptr->v_samp_factor, FALSE);
     /* Count non-dummy DCT block rows in this iMCU row. */
     if (cinfo->output_iMCU_row < last_iMCU_row)
       block_rows = compptr->v_samp_factor;
     else {
       /* NB: can't use last_row_height here; it is input-side-dependent! */
-      block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+      block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
     }
     inverse_DCT = cinfo->idct->inverse_DCT[ci];
@@ -307,8 +310,8 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
       output_col = 0;
       for (block_num = cinfo->master->first_MCU_col[ci];
            block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
-        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
-                        output_ptr, output_col);
+        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)buffer_ptr, output_ptr,
+                        output_col);
         buffer_ptr++;
         output_col += compptr->_DCT_scaled_size;
       }
@@ -327,19 +330,22 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 #ifdef BLOCK_SMOOTHING_SUPPORTED
 
 /*
- * This code applies interblock smoothing as described by section K.8
- * of the JPEG standard: the first 5 AC coefficients are estimated from
- * the DC values of a DCT block and its 8 neighboring blocks.
+ * This code applies interblock smoothing; the first 9 AC coefficients are
+ * estimated from the DC values of a DCT block and its 24 neighboring blocks.
  * We apply smoothing only for progressive JPEG decoding, and only if
  * the coefficients it can estimate are not yet known to full precision.
  */
 
-/* Natural-order array positions of the first 5 zigzag-order coefficients */
+/* Natural-order array positions of the first 9 zigzag-order coefficients */
 #define Q01_POS  1
 #define Q10_POS  8
 #define Q20_POS  16
 #define Q11_POS  9
 #define Q02_POS  2
+#define Q03_POS  3
+#define Q12_POS  10
+#define Q21_POS  17
+#define Q30_POS  24
 
 /*
  * Determine whether block smoothing is applicable and safe.
@@ -350,51 +356,64 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
  */
 
 LOCAL(boolean)
-smoothing_ok (j_decompress_ptr cinfo)
+smoothing_ok(j_decompress_ptr cinfo)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   boolean smoothing_useful = FALSE;
   int ci, coefi;
   jpeg_component_info *compptr;
   JQUANT_TBL *qtable;
-  int *coef_bits;
-  int *coef_bits_latch;
+  int *coef_bits, *prev_coef_bits;
+  int *coef_bits_latch, *prev_coef_bits_latch;
 
-  if (! cinfo->progressive_mode || cinfo->coef_bits == NULL)
+  if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
 
   /* Allocate latch area if not already done */
   if (coef->coef_bits_latch == NULL)
     coef->coef_bits_latch = (int *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components *
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  cinfo->num_components * 2 *
                                   (SAVED_COEFS * sizeof(int)));
   coef_bits_latch = coef->coef_bits_latch;
+  prev_coef_bits_latch =
+    &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* All components' quantization values must already be latched. */
     if ((qtable = compptr->quant_table) == NULL)
       return FALSE;
-    /* Verify DC & first 5 AC quantizers are nonzero to avoid zero-divide. */
+    /* Verify DC & first 9 AC quantizers are nonzero to avoid zero-divide. */
     if (qtable->quantval[0] == 0 ||
         qtable->quantval[Q01_POS] == 0 ||
         qtable->quantval[Q10_POS] == 0 ||
         qtable->quantval[Q20_POS] == 0 ||
         qtable->quantval[Q11_POS] == 0 ||
-        qtable->quantval[Q02_POS] == 0)
+        qtable->quantval[Q02_POS] == 0 ||
+        qtable->quantval[Q03_POS] == 0 ||
+        qtable->quantval[Q12_POS] == 0 ||
+        qtable->quantval[Q21_POS] == 0 ||
+        qtable->quantval[Q30_POS] == 0)
       return FALSE;
     /* DC values must be at least partly known for all components. */
     coef_bits = cinfo->coef_bits[ci];
+    prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
     if (coef_bits[0] < 0)
       return FALSE;
+    coef_bits_latch[0] = coef_bits[0];
     /* Block smoothing is helpful if some AC coefficients remain inaccurate. */
-    for (coefi = 1; coefi <= 5; coefi++) {
+    for (coefi = 1; coefi < SAVED_COEFS; coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
+      else
+        prev_coef_bits_latch[coefi] = -1;
       coef_bits_latch[coefi] = coef_bits[coefi];
       if (coef_bits[coefi] != 0)
         smoothing_useful = TRUE;
     }
     coef_bits_latch += SAVED_COEFS;
+    prev_coef_bits_latch += SAVED_COEFS;
   }
 
   return smoothing_useful;
@@ -406,24 +425,27 @@ smoothing_ok (j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
+decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
   JDIMENSION last_iMCU_row = cinfo->total_iMCU_rows - 1;
   JDIMENSION block_num, last_block_column;
   int ci, block_row, block_rows, access_rows;
   JBLOCKARRAY buffer;
-  JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
+  JBLOCKROW buffer_ptr, prev_prev_block_row, prev_block_row;
+  JBLOCKROW next_block_row, next_next_block_row;
   JSAMPARRAY output_ptr;
   JDIMENSION output_col;
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
-  boolean first_row, last_row;
+  boolean change_dc;
   JCOEF *workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
-  JLONG Q00,Q01,Q02,Q10,Q11,Q20, num;
-  int DC1,DC2,DC3,DC4,DC5,DC6,DC7,DC8,DC9;
+  JLONG Q00, Q01, Q02, Q03 = 0, Q10, Q11, Q12 = 0, Q20, Q21 = 0, Q30 = 0, num;
+  int DC01, DC02, DC03, DC04, DC05, DC06, DC07, DC08, DC09, DC10, DC11, DC12,
+      DC13, DC14, DC15, DC16, DC17, DC18, DC19, DC20, DC21, DC22, DC23, DC24,
+      DC25;
   int Al, pred;
 
   /* Keep a local variable to avoid looking it up more than once */
@@ -431,18 +453,18 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 
   /* Force some input to be done if we are getting ahead of the input. */
   while (cinfo->input_scan_number <= cinfo->output_scan_number &&
-         ! cinfo->inputctl->eoi_reached) {
+         !cinfo->inputctl->eoi_reached) {
     if (cinfo->input_scan_number == cinfo->output_scan_number) {
       /* If input is working on current scan, we ordinarily want it to
        * have completed the current row.  But if input scan is DC,
-       * we want it to keep one row ahead so that next block row's DC
+       * we want it to keep two rows ahead so that next two block rows' DC
        * values are up to date.
        */
-      JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0;
-      if (cinfo->input_iMCU_row > cinfo->output_iMCU_row+delta)
+      JDIMENSION delta = (cinfo->Ss == 0) ? 2 : 0;
+      if (cinfo->input_iMCU_row > cinfo->output_iMCU_row + delta)
         break;
     }
-    if ((*cinfo->inputctl->consume_input)(cinfo) == JPEG_SUSPENDED)
+    if ((*cinfo->inputctl->consume_input) (cinfo) == JPEG_SUSPENDED)
       return JPEG_SUSPENDED;
   }
 
@@ -450,37 +472,56 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Don't bother to IDCT an uninteresting component. */
-    if (! compptr->component_needed)
+    if (!compptr->component_needed)
       continue;
     /* Count non-dummy DCT block rows in this iMCU row. */
-    if (cinfo->output_iMCU_row < last_iMCU_row) {
+    if (cinfo->output_iMCU_row < last_iMCU_row - 1) {
+      block_rows = compptr->v_samp_factor;
+      access_rows = block_rows * 3; /* this and next two iMCU rows */
+    } else if (cinfo->output_iMCU_row < last_iMCU_row) {
       block_rows = compptr->v_samp_factor;
       access_rows = block_rows * 2; /* this and next iMCU row */
-      last_row = FALSE;
     } else {
       /* NB: can't use last_row_height here; it is input-side-dependent! */
-      block_rows = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+      block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
       access_rows = block_rows; /* this iMCU row only */
-      last_row = TRUE;
     }
     /* Align the virtual buffer for this component. */
-    if (cinfo->output_iMCU_row > 0) {
-      access_rows += compptr->v_samp_factor; /* prior iMCU row too */
+    if (cinfo->output_iMCU_row > 1) {
+      access_rows += 2 * compptr->v_samp_factor; /* prior two iMCU rows too */
       buffer = (*cinfo->mem->access_virt_barray)
-        ((j_common_ptr) cinfo, coef->whole_image[ci],
+        ((j_common_ptr)cinfo, coef->whole_image[ci],
+         (cinfo->output_iMCU_row - 2) * compptr->v_samp_factor,
+         (JDIMENSION)access_rows, FALSE);
+      buffer += 2 * compptr->v_samp_factor; /* point to current iMCU row */
+    } else if (cinfo->output_iMCU_row > 0) {
+      buffer = (*cinfo->mem->access_virt_barray)
+        ((j_common_ptr)cinfo, coef->whole_image[ci],
          (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
-         (JDIMENSION) access_rows, FALSE);
+         (JDIMENSION)access_rows, FALSE);
       buffer += compptr->v_samp_factor; /* point to current iMCU row */
-      first_row = FALSE;
     } else {
       buffer = (*cinfo->mem->access_virt_barray)
-        ((j_common_ptr) cinfo, coef->whole_image[ci],
-         (JDIMENSION) 0, (JDIMENSION) access_rows, FALSE);
-      first_row = TRUE;
+        ((j_common_ptr)cinfo, coef->whole_image[ci],
+         (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
     }
-    /* Fetch component-dependent info */
-    coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+    /* Fetch component-dependent info.
+     * If the current scan is incomplete, then we use the component-dependent
+     * info from the previous scan.
+     */
+    if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+      coef_bits =
+        coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+    else
+      coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+
+    /* We only do DC interpolation if no AC coefficient data is available. */
+    change_dc =
+      coef_bits[1] == -1 && coef_bits[2] == -1 && coef_bits[3] == -1 &&
+      coef_bits[4] == -1 && coef_bits[5] == -1 && coef_bits[6] == -1 &&
+      coef_bits[7] == -1 && coef_bits[8] == -1 && coef_bits[9] == -1;
+
     quanttbl = compptr->quant_table;
     Q00 = quanttbl->quantval[0];
     Q01 = quanttbl->quantval[Q01_POS];
@@ -488,124 +529,268 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     Q20 = quanttbl->quantval[Q20_POS];
     Q11 = quanttbl->quantval[Q11_POS];
     Q02 = quanttbl->quantval[Q02_POS];
+    if (change_dc) {
+      Q03 = quanttbl->quantval[Q03_POS];
+      Q12 = quanttbl->quantval[Q12_POS];
+      Q21 = quanttbl->quantval[Q21_POS];
+      Q30 = quanttbl->quantval[Q30_POS];
+    }
     inverse_DCT = cinfo->idct->inverse_DCT[ci];
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
       buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
-      if (first_row && block_row == 0)
+
+      if (block_row > 0 || cinfo->output_iMCU_row > 0)
+        prev_block_row =
+          buffer[block_row - 1] + cinfo->master->first_MCU_col[ci];
+      else
         prev_block_row = buffer_ptr;
+
+      if (block_row > 1 || cinfo->output_iMCU_row > 1)
+        prev_prev_block_row =
+          buffer[block_row - 2] + cinfo->master->first_MCU_col[ci];
+      else
+        prev_prev_block_row = prev_block_row;
+
+      if (block_row < block_rows - 1 || cinfo->output_iMCU_row < last_iMCU_row)
+        next_block_row =
+          buffer[block_row + 1] + cinfo->master->first_MCU_col[ci];
       else
-        prev_block_row = buffer[block_row-1];
-      if (last_row && block_row == block_rows-1)
         next_block_row = buffer_ptr;
+
+      if (block_row < block_rows - 2 ||
+          cinfo->output_iMCU_row < last_iMCU_row - 1)
+        next_next_block_row =
+          buffer[block_row + 2] + cinfo->master->first_MCU_col[ci];
       else
-        next_block_row = buffer[block_row+1];
+        next_next_block_row = next_block_row;
+
       /* We fetch the surrounding DC values using a sliding-register approach.
-       * Initialize all nine here so as to do the right thing on narrow pics.
+       * Initialize all 25 here so as to do the right thing on narrow pics.
        */
-      DC1 = DC2 = DC3 = (int) prev_block_row[0][0];
-      DC4 = DC5 = DC6 = (int) buffer_ptr[0][0];
-      DC7 = DC8 = DC9 = (int) next_block_row[0][0];
+      DC01 = DC02 = DC03 = DC04 = DC05 = (int)prev_prev_block_row[0][0];
+      DC06 = DC07 = DC08 = DC09 = DC10 = (int)prev_block_row[0][0];
+      DC11 = DC12 = DC13 = DC14 = DC15 = (int)buffer_ptr[0][0];
+      DC16 = DC17 = DC18 = DC19 = DC20 = (int)next_block_row[0][0];
+      DC21 = DC22 = DC23 = DC24 = DC25 = (int)next_next_block_row[0][0];
       output_col = 0;
       last_block_column = compptr->width_in_blocks - 1;
       for (block_num = cinfo->master->first_MCU_col[ci];
            block_num <= cinfo->master->last_MCU_col[ci]; block_num++) {
         /* Fetch current DCT block into workspace so we can modify it. */
-        jcopy_block_row(buffer_ptr, (JBLOCKROW) workspace, (JDIMENSION) 1);
+        jcopy_block_row(buffer_ptr, (JBLOCKROW)workspace, (JDIMENSION)1);
         /* Update DC values */
-        if (block_num < last_block_column) {
-          DC3 = (int) prev_block_row[1][0];
-          DC6 = (int) buffer_ptr[1][0];
-          DC9 = (int) next_block_row[1][0];
+        if (block_num == cinfo->master->first_MCU_col[ci] &&
+            block_num < last_block_column) {
+          DC04 = (int)prev_prev_block_row[1][0];
+          DC09 = (int)prev_block_row[1][0];
+          DC14 = (int)buffer_ptr[1][0];
+          DC19 = (int)next_block_row[1][0];
+          DC24 = (int)next_next_block_row[1][0];
         }
-        /* Compute coefficient estimates per K.8.
-         * An estimate is applied only if coefficient is still zero,
-         * and is not known to be fully accurate.
+        if (block_num + 1 < last_block_column) {
+          DC05 = (int)prev_prev_block_row[2][0];
+          DC10 = (int)prev_block_row[2][0];
+          DC15 = (int)buffer_ptr[2][0];
+          DC20 = (int)next_block_row[2][0];
+          DC25 = (int)next_next_block_row[2][0];
+        }
+        /* If DC interpolation is enabled, compute coefficient estimates using
+         * a Gaussian-like kernel, keeping the averages of the DC values.
+         *
+         * If DC interpolation is disabled, compute coefficient estimates using
+         * an algorithm similar to the one described in Section K.8 of the JPEG
+         * standard, except applied to a 5x5 window rather than a 3x3 window.
+         *
+         * An estimate is applied only if the coefficient is still zero and is
+         * not known to be fully accurate.
          */
         /* AC01 */
-        if ((Al=coef_bits[1]) != 0 && workspace[1] == 0) {
-          num = 36 * Q00 * (DC4 - DC6);
+        if ((Al = coef_bits[1]) != 0 && workspace[1] == 0) {
+          num = Q00 * (change_dc ?
+                (-DC01 - DC02 + DC04 + DC05 - 3 * DC06 + 13 * DC07 -
+                 13 * DC09 + 3 * DC10 - 3 * DC11 + 38 * DC12 - 38 * DC14 +
+                 3 * DC15 - 3 * DC16 + 13 * DC17 - 13 * DC19 + 3 * DC20 -
+                 DC21 - DC22 + DC24 + DC25) :
+                (-7 * DC11 + 50 * DC12 - 50 * DC14 + 7 * DC15));
           if (num >= 0) {
-            pred = (int) (((Q01<<7) + num) / (Q01<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q01 << 7) + num) / (Q01 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q01<<7) - num) / (Q01<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q01 << 7) - num) / (Q01 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[1] = (JCOEF) pred;
+          workspace[1] = (JCOEF)pred;
         }
         /* AC10 */
-        if ((Al=coef_bits[2]) != 0 && workspace[8] == 0) {
-          num = 36 * Q00 * (DC2 - DC8);
+        if ((Al = coef_bits[2]) != 0 && workspace[8] == 0) {
+          num = Q00 * (change_dc ?
+                (-DC01 - 3 * DC02 - 3 * DC03 - 3 * DC04 - DC05 - DC06 +
+                 13 * DC07 + 38 * DC08 + 13 * DC09 - DC10 + DC16 -
+                 13 * DC17 - 38 * DC18 - 13 * DC19 + DC20 + DC21 +
+                 3 * DC22 + 3 * DC23 + 3 * DC24 + DC25) :
+                (-7 * DC03 + 50 * DC08 - 50 * DC18 + 7 * DC23));
           if (num >= 0) {
-            pred = (int) (((Q10<<7) + num) / (Q10<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q10 << 7) + num) / (Q10 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q10<<7) - num) / (Q10<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q10 << 7) - num) / (Q10 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[8] = (JCOEF) pred;
+          workspace[8] = (JCOEF)pred;
         }
         /* AC20 */
-        if ((Al=coef_bits[3]) != 0 && workspace[16] == 0) {
-          num = 9 * Q00 * (DC2 + DC8 - 2*DC5);
+        if ((Al = coef_bits[3]) != 0 && workspace[16] == 0) {
+          num = Q00 * (change_dc ?
+                (DC03 + 2 * DC07 + 7 * DC08 + 2 * DC09 - 5 * DC12 - 14 * DC13 -
+                 5 * DC14 + 2 * DC17 + 7 * DC18 + 2 * DC19 + DC23) :
+                (-DC03 + 13 * DC08 - 24 * DC13 + 13 * DC18 - DC23));
           if (num >= 0) {
-            pred = (int) (((Q20<<7) + num) / (Q20<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q20 << 7) + num) / (Q20 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q20<<7) - num) / (Q20<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q20 << 7) - num) / (Q20 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[16] = (JCOEF) pred;
+          workspace[16] = (JCOEF)pred;
         }
         /* AC11 */
-        if ((Al=coef_bits[4]) != 0 && workspace[9] == 0) {
-          num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
+        if ((Al = coef_bits[4]) != 0 && workspace[9] == 0) {
+          num = Q00 * (change_dc ?
+                (-DC01 + DC05 + 9 * DC07 - 9 * DC09 - 9 * DC17 +
+                 9 * DC19 + DC21 - DC25) :
+                (DC10 + DC16 - 10 * DC17 + 10 * DC19 - DC02 - DC20 + DC22 -
+                 DC24 + DC04 - DC06 + 10 * DC07 - 10 * DC09));
           if (num >= 0) {
-            pred = (int) (((Q11<<7) + num) / (Q11<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q11 << 7) + num) / (Q11 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q11<<7) - num) / (Q11<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q11 << 7) - num) / (Q11 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[9] = (JCOEF) pred;
+          workspace[9] = (JCOEF)pred;
         }
         /* AC02 */
-        if ((Al=coef_bits[5]) != 0 && workspace[2] == 0) {
-          num = 9 * Q00 * (DC4 + DC6 - 2*DC5);
+        if ((Al = coef_bits[5]) != 0 && workspace[2] == 0) {
+          num = Q00 * (change_dc ?
+                (2 * DC07 - 5 * DC08 + 2 * DC09 + DC11 + 7 * DC12 - 14 * DC13 +
+                 7 * DC14 + DC15 + 2 * DC17 - 5 * DC18 + 2 * DC19) :
+                (-DC11 + 13 * DC12 - 24 * DC13 + 13 * DC14 - DC15));
           if (num >= 0) {
-            pred = (int) (((Q02<<7) + num) / (Q02<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q02 << 7) + num) / (Q02 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
           } else {
-            pred = (int) (((Q02<<7) - num) / (Q02<<8));
-            if (Al > 0 && pred >= (1<<Al))
-              pred = (1<<Al)-1;
+            pred = (int)(((Q02 << 7) - num) / (Q02 << 8));
+            if (Al > 0 && pred >= (1 << Al))
+              pred = (1 << Al) - 1;
             pred = -pred;
           }
-          workspace[2] = (JCOEF) pred;
+          workspace[2] = (JCOEF)pred;
         }
+        if (change_dc) {
+          /* AC03 */
+          if ((Al = coef_bits[6]) != 0 && workspace[3] == 0) {
+            num = Q00 * (DC07 - DC09 + 2 * DC12 - 2 * DC14 + DC17 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q03 << 7) + num) / (Q03 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q03 << 7) - num) / (Q03 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[3] = (JCOEF)pred;
+          }
+          /* AC12 */
+          if ((Al = coef_bits[7]) != 0 && workspace[10] == 0) {
+            num = Q00 * (DC07 - 3 * DC08 + DC09 - DC17 + 3 * DC18 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q12 << 7) + num) / (Q12 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q12 << 7) - num) / (Q12 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[10] = (JCOEF)pred;
+          }
+          /* AC21 */
+          if ((Al = coef_bits[8]) != 0 && workspace[17] == 0) {
+            num = Q00 * (DC07 - DC09 - 3 * DC12 + 3 * DC14 + DC17 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q21 << 7) + num) / (Q21 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q21 << 7) - num) / (Q21 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[17] = (JCOEF)pred;
+          }
+          /* AC30 */
+          if ((Al = coef_bits[9]) != 0 && workspace[24] == 0) {
+            num = Q00 * (DC07 + 2 * DC08 + DC09 - DC17 - 2 * DC18 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q30 << 7) + num) / (Q30 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q30 << 7) - num) / (Q30 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[24] = (JCOEF)pred;
+          }
+          /* coef_bits[0] is non-negative.  Otherwise this function would not
+           * be called.
+           */
+          num = Q00 *
+                (-2 * DC01 - 6 * DC02 - 8 * DC03 - 6 * DC04 - 2 * DC05 -
+                 6 * DC06 + 6 * DC07 + 42 * DC08 + 6 * DC09 - 6 * DC10 -
+                 8 * DC11 + 42 * DC12 + 152 * DC13 + 42 * DC14 - 8 * DC15 -
+                 6 * DC16 + 6 * DC17 + 42 * DC18 + 6 * DC19 - 6 * DC20 -
+                 2 * DC21 - 6 * DC22 - 8 * DC23 - 6 * DC24 - 2 * DC25);
+          if (num >= 0) {
+            pred = (int)(((Q00 << 7) + num) / (Q00 << 8));
+          } else {
+            pred = (int)(((Q00 << 7) - num) / (Q00 << 8));
+            pred = -pred;
+          }
+          workspace[0] = (JCOEF)pred;
+        }  /* change_dc */
+
         /* OK, do the IDCT */
-        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR) workspace,
-                        output_ptr, output_col);
+        (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)workspace, output_ptr,
+                        output_col);
         /* Advance for next column */
-        DC1 = DC2; DC2 = DC3;
-        DC4 = DC5; DC5 = DC6;
-        DC7 = DC8; DC8 = DC9;
-        buffer_ptr++, prev_block_row++, next_block_row++;
+        DC01 = DC02;  DC02 = DC03;  DC03 = DC04;  DC04 = DC05;
+        DC06 = DC07;  DC07 = DC08;  DC08 = DC09;  DC09 = DC10;
+        DC11 = DC12;  DC12 = DC13;  DC13 = DC14;  DC14 = DC15;
+        DC16 = DC17;  DC17 = DC18;  DC18 = DC19;  DC19 = DC20;
+        DC21 = DC22;  DC22 = DC23;  DC23 = DC24;  DC24 = DC25;
+        buffer_ptr++, prev_block_row++, next_block_row++,
+          prev_prev_block_row++, next_next_block_row++;
         output_col += compptr->_DCT_scaled_size;
       }
       output_ptr += compptr->_DCT_scaled_size;
@@ -625,14 +810,14 @@ decompress_smooth_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
  */
 
 GLOBAL(void)
-jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_coef_ptr coef;
 
   coef = (my_coef_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_coef_controller));
-  cinfo->coef = (struct jpeg_d_coef_controller *) coef;
+  cinfo->coef = (struct jpeg_d_coef_controller *)coef;
   coef->pub.start_input_pass = start_input_pass;
   coef->pub.start_output_pass = start_output_pass;
 #ifdef BLOCK_SMOOTHING_SUPPORTED
@@ -654,15 +839,15 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 #ifdef BLOCK_SMOOTHING_SUPPORTED
       /* If block smoothing could be used, need a bigger window */
       if (cinfo->progressive_mode)
-        access_rows *= 3;
+        access_rows *= 5;
 #endif
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE, TRUE,
-         (JDIMENSION) jround_up((long) compptr->width_in_blocks,
-                                (long) compptr->h_samp_factor),
-         (JDIMENSION) jround_up((long) compptr->height_in_blocks,
-                                (long) compptr->v_samp_factor),
-         (JDIMENSION) access_rows);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, TRUE,
+         (JDIMENSION)jround_up((long)compptr->width_in_blocks,
+                               (long)compptr->h_samp_factor),
+         (JDIMENSION)jround_up((long)compptr->height_in_blocks,
+                               (long)compptr->v_samp_factor),
+         (JDIMENSION)access_rows);
     }
     coef->pub.consume_data = consume_data;
     coef->pub.decompress_data = decompress_data;
@@ -676,7 +861,7 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
     int i;
 
     buffer = (JBLOCKROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   D_MAX_BLOCKS_IN_MCU * sizeof(JBLOCK));
     for (i = 0; i < D_MAX_BLOCKS_IN_MCU; i++) {
       coef->MCU_buffer[i] = buffer + i;
@@ -688,6 +873,6 @@ jinit_d_coef_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
 
   /* Allocate the workspace buffer */
   coef->workspace = (JCOEF *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(JCOEF) * DCTSIZE2);
 }
diff --git a/media/libjpeg/jdcoefct.h b/media/libjpeg/jdcoefct.h
index bf6beb274b..9a0e780663 100644
--- a/media/libjpeg/jdcoefct.h
+++ b/media/libjpeg/jdcoefct.h
@@ -5,6 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
@@ -51,7 +52,7 @@ typedef struct {
 #ifdef BLOCK_SMOOTHING_SUPPORTED
   /* When doing block smoothing, we latch coefficient Al values here */
   int *coef_bits_latch;
-#define SAVED_COEFS  6          /* we save coef_bits[0..5] */
+#define SAVED_COEFS  10         /* we save coef_bits[0..9] */
 #endif
 } my_coef_controller;
 
@@ -59,10 +60,10 @@ typedef my_coef_controller *my_coef_ptr;
 
 
 LOCAL(void)
-start_iMCU_row (j_decompress_ptr cinfo)
+start_iMCU_row(j_decompress_ptr cinfo)
 /* Reset within-iMCU-row counters for a new row (input side) */
 {
-  my_coef_ptr coef = (my_coef_ptr) cinfo->coef;
+  my_coef_ptr coef = (my_coef_ptr)cinfo->coef;
 
   /* In an interleaved scan, an MCU row is the same as an iMCU row.
    * In a noninterleaved scan, an iMCU row has v_samp_factor MCU rows.
@@ -71,7 +72,7 @@ start_iMCU_row (j_decompress_ptr cinfo)
   if (cinfo->comps_in_scan > 1) {
     coef->MCU_rows_per_iMCU_row = 1;
   } else {
-    if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows-1))
+    if (cinfo->input_iMCU_row < (cinfo->total_iMCU_rows - 1))
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->v_samp_factor;
     else
       coef->MCU_rows_per_iMCU_row = cinfo->cur_comp_info[0]->last_row_height;
diff --git a/media/libjpeg/jdcol565.c b/media/libjpeg/jdcol565.c
index 349fce4a66..53c7bd9187 100644
--- a/media/libjpeg/jdcol565.c
+++ b/media/libjpeg/jdcol565.c
@@ -17,22 +17,22 @@
 
 INLINE
 LOCAL(void)
-ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
-                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-                             JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, JSAMPARRAY output_buf,
+                            int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register JLONG * Crgtab = cconvert->Cr_g_tab;
-  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -45,31 +45,31 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
     outptr = *output_buf++;
 
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -80,15 +80,15 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -96,22 +96,22 @@ ycc_rgb565_convert_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
-                              JSAMPIMAGE input_buf, JDIMENSION input_row,
-                              JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register JLONG * Crgtab = cconvert->Cr_g_tab;
-  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
 
@@ -125,23 +125,23 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -150,9 +150,9 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -165,16 +165,16 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                                      SCALEBITS)), d0)];
       b = range_limit[DITHER_565_B(y + Cbbtab[cb], d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -182,9 +182,9 @@ ycc_rgb565D_convert_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
-                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-                             JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                            JDIMENSION input_row, JSAMPARRAY output_buf,
+                            int num_rows)
 {
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
@@ -202,34 +202,34 @@ rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
       WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = GETJSAMPLE(*inptr0);
-      g = GETJSAMPLE(*inptr1);
-      b = GETJSAMPLE(*inptr2);
+      r = *inptr0;
+      g = *inptr1;
+      b = *inptr2;
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -237,14 +237,14 @@ rgb_rgb565_convert_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
-                              JSAMPIMAGE input_buf, JDIMENSION input_row,
-                              JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
 {
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   SHIFT_TEMPS
@@ -259,24 +259,24 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
@@ -284,11 +284,11 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+      r = range_limit[DITHER_565_R(*inptr0, d0)];
+      g = range_limit[DITHER_565_G(*inptr1, d0)];
+      b = range_limit[DITHER_565_B(*inptr2, d0)];
       rgb = PACK_SHORT_565(r, g, b);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -296,9 +296,9 @@ rgb_rgb565D_convert_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-gray_rgb565_convert_internal (j_decompress_ptr cinfo,
-                              JSAMPIMAGE input_buf, JDIMENSION input_row,
-                              JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                             JDIMENSION input_row, JSAMPARRAY output_buf,
+                             int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
@@ -313,7 +313,7 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
     if (PACK_NEED_ALIGNMENT(outptr)) {
       g = *inptr++;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -328,7 +328,7 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
     if (num_cols & 1) {
       g = *inptr;
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
@@ -336,13 +336,13 @@ gray_rgb565_convert_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
-                               JSAMPIMAGE input_buf, JDIMENSION input_row,
-                               JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                              JDIMENSION input_row, JSAMPARRAY output_buf,
+                              int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
   JDIMENSION num_cols = cinfo->output_width;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
 
@@ -356,7 +356,7 @@ gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
       g = *inptr++;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
@@ -378,7 +378,7 @@ gray_rgb565D_convert_internal (j_decompress_ptr cinfo,
       g = *inptr;
       g = range_limit[DITHER_565_R(g, d0)];
       rgb = PACK_SHORT_565(g, g, g);
-      *(INT16*)outptr = (INT16)rgb;
+      *(INT16 *)outptr = (INT16)rgb;
     }
   }
 }
diff --git a/media/libjpeg/jdcolext.c b/media/libjpeg/jdcolext.c
index 59b676cc4d..863c7a2fbc 100644
--- a/media/libjpeg/jdcolext.c
+++ b/media/libjpeg/jdcolext.c
@@ -28,22 +28,22 @@
 
 INLINE
 LOCAL(void)
-ycc_rgb_convert_internal (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2;
   register JDIMENSION col;
   JDIMENSION num_cols = cinfo->output_width;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  register int * Crrtab = cconvert->Cr_r_tab;
-  register int * Cbbtab = cconvert->Cb_b_tab;
-  register JLONG * Crgtab = cconvert->Cr_g_tab;
-  register JLONG * Cbgtab = cconvert->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  register int *Crrtab = cconvert->Cr_r_tab;
+  register int *Cbbtab = cconvert->Cb_b_tab;
+  register JLONG *Crgtab = cconvert->Cr_g_tab;
+  register JLONG *Cbgtab = cconvert->Cb_g_tab;
   SHIFT_TEMPS
 
   while (--num_rows >= 0) {
@@ -53,14 +53,14 @@ ycc_rgb_convert_internal (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
-                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
-                                                 SCALEBITS))];
+                              ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                                                SCALEBITS))];
       outptr[RGB_BLUE] =  range_limit[y + Cbbtab[cb]];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
@@ -81,9 +81,9 @@ ycc_rgb_convert_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-gray_rgb_convert_internal (j_decompress_ptr cinfo,
-                           JSAMPIMAGE input_buf, JDIMENSION input_row,
-                           JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                          JDIMENSION input_row, JSAMPARRAY output_buf,
+                          int num_rows)
 {
   register JSAMPROW inptr, outptr;
   register JDIMENSION col;
@@ -93,7 +93,6 @@ gray_rgb_convert_internal (j_decompress_ptr cinfo,
     inptr = input_buf[0][input_row++];
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
@@ -112,9 +111,9 @@ gray_rgb_convert_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-rgb_rgb_convert_internal (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
 {
   register JSAMPROW inptr0, inptr1, inptr2;
   register JSAMPROW outptr;
@@ -128,7 +127,6 @@ rgb_rgb_convert_internal (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = inptr0[col];
       outptr[RGB_GREEN] = inptr1[col];
       outptr[RGB_BLUE] = inptr2[col];
diff --git a/media/libjpeg/jdcolor.c b/media/libjpeg/jdcolor.c
index ab8fa24925..8da2b4eaf2 100644
--- a/media/libjpeg/jdcolor.c
+++ b/media/libjpeg/jdcolor.c
@@ -74,8 +74,8 @@ typedef my_color_deconverter *my_cconvert_ptr;
  */
 
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF        ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x)          ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
 
 /* We allocate one big table for RGB->Y conversion and divide it up into
  * three parts, instead of doing three alloc_small requests.  This lets us
@@ -85,9 +85,9 @@ typedef my_color_deconverter *my_cconvert_ptr;
  */
 
 #define R_Y_OFF         0                       /* offset to R => Y section */
-#define G_Y_OFF         (1*(MAXJSAMPLE+1))      /* offset to G => Y section */
-#define B_Y_OFF         (2*(MAXJSAMPLE+1))      /* etc. */
-#define TABLE_SIZE      (3*(MAXJSAMPLE+1))
+#define G_Y_OFF         (1 * (MAXJSAMPLE + 1))  /* offset to G => Y section */
+#define B_Y_OFF         (2 * (MAXJSAMPLE + 1))  /* etc. */
+#define TABLE_SIZE      (3 * (MAXJSAMPLE + 1))
 
 
 /* Include inline routines for colorspace extensions */
@@ -98,13 +98,13 @@ typedef my_color_deconverter *my_cconvert_ptr;
 #undef RGB_BLUE
 #undef RGB_PIXELSIZE
 
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extrgb_convert_internal
-#define gray_rgb_convert_internal gray_extrgb_convert_internal
-#define rgb_rgb_convert_internal rgb_extrgb_convert_internal
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define ycc_rgb_convert_internal  ycc_extrgb_convert_internal
+#define gray_rgb_convert_internal  gray_extrgb_convert_internal
+#define rgb_rgb_convert_internal  rgb_extrgb_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -114,14 +114,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
 #undef gray_rgb_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extrgbx_convert_internal
-#define gray_rgb_convert_internal gray_extrgbx_convert_internal
-#define rgb_rgb_convert_internal rgb_extrgbx_convert_internal
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define ycc_rgb_convert_internal  ycc_extrgbx_convert_internal
+#define gray_rgb_convert_internal  gray_extrgbx_convert_internal
+#define rgb_rgb_convert_internal  rgb_extrgbx_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -132,13 +132,13 @@ typedef my_color_deconverter *my_cconvert_ptr;
 #undef gray_rgb_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extbgr_convert_internal
-#define gray_rgb_convert_internal gray_extbgr_convert_internal
-#define rgb_rgb_convert_internal rgb_extbgr_convert_internal
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define ycc_rgb_convert_internal  ycc_extbgr_convert_internal
+#define gray_rgb_convert_internal  gray_extbgr_convert_internal
+#define rgb_rgb_convert_internal  rgb_extbgr_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -148,14 +148,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
 #undef gray_rgb_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extbgrx_convert_internal
-#define gray_rgb_convert_internal gray_extbgrx_convert_internal
-#define rgb_rgb_convert_internal rgb_extbgrx_convert_internal
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define ycc_rgb_convert_internal  ycc_extbgrx_convert_internal
+#define gray_rgb_convert_internal  gray_extbgrx_convert_internal
+#define rgb_rgb_convert_internal  rgb_extbgrx_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -166,14 +166,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
 #undef gray_rgb_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extxbgr_convert_internal
-#define gray_rgb_convert_internal gray_extxbgr_convert_internal
-#define rgb_rgb_convert_internal rgb_extxbgr_convert_internal
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define ycc_rgb_convert_internal  ycc_extxbgr_convert_internal
+#define gray_rgb_convert_internal  gray_extxbgr_convert_internal
+#define rgb_rgb_convert_internal  rgb_extxbgr_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -184,14 +184,14 @@ typedef my_color_deconverter *my_cconvert_ptr;
 #undef gray_rgb_convert_internal
 #undef rgb_rgb_convert_internal
 
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define ycc_rgb_convert_internal ycc_extxrgb_convert_internal
-#define gray_rgb_convert_internal gray_extxrgb_convert_internal
-#define rgb_rgb_convert_internal rgb_extxrgb_convert_internal
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define ycc_rgb_convert_internal  ycc_extxrgb_convert_internal
+#define gray_rgb_convert_internal  gray_extxrgb_convert_internal
+#define rgb_rgb_convert_internal  rgb_extxrgb_convert_internal
 #include "jdcolext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -208,25 +208,25 @@ typedef my_color_deconverter *my_cconvert_ptr;
  */
 
 LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   int i;
   JLONG x;
   SHIFT_TEMPS
 
   cconvert->Cr_r_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   cconvert->Cb_b_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   cconvert->Cr_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
   cconvert->Cb_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -238,10 +238,10 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
     cconvert->Cb_b_tab[i] = (int)
                     RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
     /* Cr=>G value is scaled-up -0.71414 * x */
-    cconvert->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+    cconvert->Cr_g_tab[i] = (-FIX(0.71414)) * x;
     /* Cb=>G value is scaled-up -0.34414 * x */
     /* We also add in ONE_HALF so that need not do it in inner loop */
-    cconvert->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+    cconvert->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
   }
 }
 
@@ -251,43 +251,42 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-ycc_rgb_convert (j_decompress_ptr cinfo,
-                 JSAMPIMAGE input_buf, JDIMENSION input_row,
-                 JSAMPARRAY output_buf, int num_rows)
+ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    default:
-      ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    ycc_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    ycc_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    ycc_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    ycc_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    ycc_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    ycc_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  default:
+    ycc_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                             num_rows);
+    break;
   }
 }
 
@@ -300,21 +299,21 @@ ycc_rgb_convert (j_decompress_ptr cinfo,
  */
 
 LOCAL(void)
-build_rgb_y_table (j_decompress_ptr cinfo)
+build_rgb_y_table(j_decompress_ptr cinfo)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   JLONG *rgb_y_tab;
   JLONG i;
 
   /* Allocate and fill in the conversion tables. */
   cconvert->rgb_y_tab = rgb_y_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 (TABLE_SIZE * sizeof(JLONG)));
 
   for (i = 0; i <= MAXJSAMPLE; i++) {
-    rgb_y_tab[i+R_Y_OFF] = FIX(0.29900) * i;
-    rgb_y_tab[i+G_Y_OFF] = FIX(0.58700) * i;
-    rgb_y_tab[i+B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
+    rgb_y_tab[i + R_Y_OFF] = FIX(0.29900) * i;
+    rgb_y_tab[i + G_Y_OFF] = FIX(0.58700) * i;
+    rgb_y_tab[i + B_Y_OFF] = FIX(0.11400) * i + ONE_HALF;
   }
 }
 
@@ -324,11 +323,10 @@ build_rgb_y_table (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-rgb_gray_convert (j_decompress_ptr cinfo,
-                  JSAMPIMAGE input_buf, JDIMENSION input_row,
-                  JSAMPARRAY output_buf, int num_rows)
+rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int r, g, b;
   register JLONG *ctab = cconvert->rgb_y_tab;
   register JSAMPROW outptr;
@@ -343,13 +341,12 @@ rgb_gray_convert (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr0[col]);
-      g = GETJSAMPLE(inptr1[col]);
-      b = GETJSAMPLE(inptr2[col]);
+      r = inptr0[col];
+      g = inptr1[col];
+      b = inptr2[col];
       /* Y */
-      outptr[col] = (JSAMPLE)
-                ((ctab[r+R_Y_OFF] + ctab[g+G_Y_OFF] + ctab[b+B_Y_OFF])
-                 >> SCALEBITS);
+      outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
+                               ctab[b + B_Y_OFF]) >> SCALEBITS);
     }
   }
 }
@@ -361,9 +358,8 @@ rgb_gray_convert (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-null_convert (j_decompress_ptr cinfo,
-              JSAMPIMAGE input_buf, JDIMENSION input_row,
-              JSAMPARRAY output_buf, int num_rows)
+null_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   register JSAMPROW inptr, inptr0, inptr1, inptr2, inptr3, outptr;
   register JDIMENSION col;
@@ -423,12 +419,11 @@ null_convert (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-grayscale_convert (j_decompress_ptr cinfo,
-                   JSAMPIMAGE input_buf, JDIMENSION input_row,
-                   JSAMPARRAY output_buf, int num_rows)
+grayscale_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
-  jcopy_sample_rows(input_buf[0], (int) input_row, output_buf, 0,
-                    num_rows, cinfo->output_width);
+  jcopy_sample_rows(input_buf[0], (int)input_row, output_buf, 0, num_rows,
+                    cinfo->output_width);
 }
 
 
@@ -437,43 +432,42 @@ grayscale_convert (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-gray_rgb_convert (j_decompress_ptr cinfo,
-                  JSAMPIMAGE input_buf, JDIMENSION input_row,
-                  JSAMPARRAY output_buf, int num_rows)
+gray_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                 JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    case JCS_EXT_BGR:
-      gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                    num_rows);
-      break;
-    default:
-      gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                num_rows);
-      break;
+  case JCS_EXT_RGB:
+    gray_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    gray_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  case JCS_EXT_BGR:
+    gray_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    gray_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    gray_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    gray_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                  num_rows);
+    break;
+  default:
+    gray_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                              num_rows);
+    break;
   }
 }
 
@@ -483,43 +477,42 @@ gray_rgb_convert (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-rgb_rgb_convert (j_decompress_ptr cinfo,
-                  JSAMPIMAGE input_buf, JDIMENSION input_row,
-                  JSAMPARRAY output_buf, int num_rows)
+rgb_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_BGR:
-      rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                  num_rows);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                                   num_rows);
-      break;
-    default:
-      rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
-                               num_rows);
-      break;
+  case JCS_EXT_RGB:
+    rgb_extrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    rgb_extrgbx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_BGR:
+    rgb_extbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                num_rows);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    rgb_extbgrx_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    rgb_extxbgr_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    rgb_extxrgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                                 num_rows);
+    break;
+  default:
+    rgb_rgb_convert_internal(cinfo, input_buf, input_row, output_buf,
+                             num_rows);
+    break;
   }
 }
 
@@ -532,11 +525,10 @@ rgb_rgb_convert (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-ycck_cmyk_convert (j_decompress_ptr cinfo,
-                   JSAMPIMAGE input_buf, JDIMENSION input_row,
-                   JSAMPARRAY output_buf, int num_rows)
+ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                  JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
-  my_cconvert_ptr cconvert = (my_cconvert_ptr) cinfo->cconvert;
+  my_cconvert_ptr cconvert = (my_cconvert_ptr)cinfo->cconvert;
   register int y, cb, cr;
   register JSAMPROW outptr;
   register JSAMPROW inptr0, inptr1, inptr2, inptr3;
@@ -558,17 +550,17 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
       outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
-                              ((int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
+                              ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                                  SCALEBITS)))];
       outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
-      outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
+      outptr[3] = inptr3[col];
       outptr += 4;
     }
   }
@@ -579,16 +571,15 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
  * RGB565 conversion
  */
 
-#define PACK_SHORT_565_LE(r, g, b)   ((((r) << 8) & 0xF800) |  \
-                                      (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b)   (((r) & 0xF8) | ((g) >> 5) |  \
-                                      (((g) << 11) & 0xE000) |  \
-                                      (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b) \
+  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b) \
+  (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))
 
-#define PACK_TWO_PIXELS_LE(l, r)     ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r)     ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r)    ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r)    ((l << 16) | r)
 
-#define PACK_NEED_ALIGNMENT(ptr)     (((size_t)(ptr)) & 3)
+#define PACK_NEED_ALIGNMENT(ptr)    (((size_t)(ptr)) & 3)
 
 #define WRITE_TWO_ALIGNED_PIXELS(addr, pixels)  ((*(int *)(addr)) = pixels)
 
@@ -600,7 +591,7 @@ ycck_cmyk_convert (j_decompress_ptr cinfo,
 /* Declarations for ordered dithering
  *
  * We use a 4x4 ordered dither array packed into 32 bits.  This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
  */
 
 #define DITHER_MASK       0x3
@@ -616,7 +607,7 @@ static const JLONG dither_matrix[4] = {
 static INLINE boolean is_big_endian(void)
 {
   int test_value = 1;
-  if(*(char *)&test_value != 1)
+  if (*(char *)&test_value != 1)
     return TRUE;
   return FALSE;
 }
@@ -624,14 +615,14 @@ static INLINE boolean is_big_endian(void)
 
 /* Include inline routines for RGB565 conversion */
 
-#define PACK_SHORT_565 PACK_SHORT_565_LE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
-#define ycc_rgb565_convert_internal ycc_rgb565_convert_le
-#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_le
-#define rgb_rgb565_convert_internal rgb_rgb565_convert_le
-#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_le
-#define gray_rgb565_convert_internal gray_rgb565_convert_le
-#define gray_rgb565D_convert_internal gray_rgb565D_convert_le
+#define PACK_SHORT_565  PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS  PACK_TWO_PIXELS_LE
+#define ycc_rgb565_convert_internal  ycc_rgb565_convert_le
+#define ycc_rgb565D_convert_internal  ycc_rgb565D_convert_le
+#define rgb_rgb565_convert_internal  rgb_rgb565_convert_le
+#define rgb_rgb565D_convert_internal  rgb_rgb565D_convert_le
+#define gray_rgb565_convert_internal  gray_rgb565_convert_le
+#define gray_rgb565D_convert_internal  gray_rgb565D_convert_le
 #include "jdcol565.c"
 #undef PACK_SHORT_565
 #undef PACK_TWO_PIXELS
@@ -642,14 +633,14 @@ static INLINE boolean is_big_endian(void)
 #undef gray_rgb565_convert_internal
 #undef gray_rgb565D_convert_internal
 
-#define PACK_SHORT_565 PACK_SHORT_565_BE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
-#define ycc_rgb565_convert_internal ycc_rgb565_convert_be
-#define ycc_rgb565D_convert_internal ycc_rgb565D_convert_be
-#define rgb_rgb565_convert_internal rgb_rgb565_convert_be
-#define rgb_rgb565D_convert_internal rgb_rgb565D_convert_be
-#define gray_rgb565_convert_internal gray_rgb565_convert_be
-#define gray_rgb565D_convert_internal gray_rgb565D_convert_be
+#define PACK_SHORT_565  PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS  PACK_TWO_PIXELS_BE
+#define ycc_rgb565_convert_internal  ycc_rgb565_convert_be
+#define ycc_rgb565D_convert_internal  ycc_rgb565D_convert_be
+#define rgb_rgb565_convert_internal  rgb_rgb565_convert_be
+#define rgb_rgb565D_convert_internal  rgb_rgb565D_convert_be
+#define gray_rgb565_convert_internal  gray_rgb565_convert_be
+#define gray_rgb565D_convert_internal  gray_rgb565D_convert_be
 #include "jdcol565.c"
 #undef PACK_SHORT_565
 #undef PACK_TWO_PIXELS
@@ -662,9 +653,8 @@ static INLINE boolean is_big_endian(void)
 
 
 METHODDEF(void)
-ycc_rgb565_convert (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION input_row,
-                    JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     ycc_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -674,9 +664,8 @@ ycc_rgb565_convert (j_decompress_ptr cinfo,
 
 
 METHODDEF(void)
-ycc_rgb565D_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+ycc_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     ycc_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -686,9 +675,8 @@ ycc_rgb565D_convert (j_decompress_ptr cinfo,
 
 
 METHODDEF(void)
-rgb_rgb565_convert (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION input_row,
-                    JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     rgb_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -698,9 +686,8 @@ rgb_rgb565_convert (j_decompress_ptr cinfo,
 
 
 METHODDEF(void)
-rgb_rgb565D_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+rgb_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     rgb_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -710,9 +697,8 @@ rgb_rgb565D_convert (j_decompress_ptr cinfo,
 
 
 METHODDEF(void)
-gray_rgb565_convert (j_decompress_ptr cinfo,
-                     JSAMPIMAGE input_buf, JDIMENSION input_row,
-                     JSAMPARRAY output_buf, int num_rows)
+gray_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     gray_rgb565_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -722,9 +708,8 @@ gray_rgb565_convert (j_decompress_ptr cinfo,
 
 
 METHODDEF(void)
-gray_rgb565D_convert (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION input_row,
-                      JSAMPARRAY output_buf, int num_rows)
+gray_rgb565D_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION input_row, JSAMPARRAY output_buf, int num_rows)
 {
   if (is_big_endian())
     gray_rgb565D_convert_be(cinfo, input_buf, input_row, output_buf, num_rows);
@@ -738,7 +723,7 @@ gray_rgb565D_convert (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-start_pass_dcolor (j_decompress_ptr cinfo)
+start_pass_dcolor(j_decompress_ptr cinfo)
 {
   /* no work needed */
 }
@@ -749,15 +734,15 @@ start_pass_dcolor (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_color_deconverter (j_decompress_ptr cinfo)
+jinit_color_deconverter(j_decompress_ptr cinfo)
 {
   my_cconvert_ptr cconvert;
   int ci;
 
   cconvert = (my_cconvert_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_color_deconverter));
-  cinfo->cconvert = (struct jpeg_color_deconverter *) cconvert;
+  cinfo->cconvert = (struct jpeg_color_deconverter *)cconvert;
   cconvert->pub.start_pass = start_pass_dcolor;
 
   /* Make sure num_components agrees with jpeg_color_space */
@@ -843,11 +828,11 @@ jinit_color_deconverter (j_decompress_ptr cinfo)
     cinfo->out_color_components = 3;
     if (cinfo->dither_mode == JDITHER_NONE) {
       if (cinfo->jpeg_color_space == JCS_YCbCr) {
-         if (jsimd_can_ycc_rgb565())
-           cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
-         else {
-           cconvert->pub.color_convert = ycc_rgb565_convert;
-           build_ycc_rgb_table(cinfo);
+        if (jsimd_can_ycc_rgb565())
+          cconvert->pub.color_convert = jsimd_ycc_rgb565_convert;
+        else {
+          cconvert->pub.color_convert = ycc_rgb565_convert;
+          build_ycc_rgb_table(cinfo);
         }
       } else if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
         cconvert->pub.color_convert = gray_rgb565_convert;
diff --git a/media/libjpeg/jdct.h b/media/libjpeg/jdct.h
index faf8e1cf03..66d1718b77 100644
--- a/media/libjpeg/jdct.h
+++ b/media/libjpeg/jdct.h
@@ -36,7 +36,7 @@ typedef int DCTELEM;            /* 16 or 32 bits is fine */
 typedef unsigned int UDCTELEM;
 typedef unsigned long long UDCTELEM2;
 #else
-typedef short DCTELEM;  /* prefer 16 bit with SIMD for parellelism */
+typedef short DCTELEM;          /* prefer 16 bit with SIMD for parellelism */
 typedef unsigned short UDCTELEM;
 typedef unsigned int UDCTELEM2;
 #endif
@@ -63,15 +63,15 @@ typedef unsigned long long UDCTELEM2;
  * Each IDCT routine has its own ideas about the best dct_table element type.
  */
 
-typedef MULTIPLIER ISLOW_MULT_TYPE; /* short or int, whichever is faster */
+typedef MULTIPLIER ISLOW_MULT_TYPE;  /* short or int, whichever is faster */
 #if BITS_IN_JSAMPLE == 8
-typedef MULTIPLIER IFAST_MULT_TYPE; /* 16 bits is OK, use short if faster */
-#define IFAST_SCALE_BITS  2     /* fractional bits in scale factors */
+typedef MULTIPLIER IFAST_MULT_TYPE;  /* 16 bits is OK, use short if faster */
+#define IFAST_SCALE_BITS  2          /* fractional bits in scale factors */
 #else
-typedef JLONG IFAST_MULT_TYPE;  /* need 32 bits for scaled quantizers */
-#define IFAST_SCALE_BITS  13    /* fractional bits in scale factors */
+typedef JLONG IFAST_MULT_TYPE;       /* need 32 bits for scaled quantizers */
+#define IFAST_SCALE_BITS  13         /* fractional bits in scale factors */
 #endif
-typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
+typedef FAST_FLOAT FLOAT_MULT_TYPE;  /* preferred floating type */
 
 
 /*
@@ -90,64 +90,64 @@ typedef FAST_FLOAT FLOAT_MULT_TYPE; /* preferred floating type */
 
 /* Extern declarations for the forward and inverse DCT routines. */
 
-EXTERN(void) jpeg_fdct_islow (DCTELEM *data);
-EXTERN(void) jpeg_fdct_ifast (DCTELEM *data);
-EXTERN(void) jpeg_fdct_float (FAST_FLOAT *data);
+EXTERN(void) jpeg_fdct_islow(DCTELEM *data);
+EXTERN(void) jpeg_fdct_ifast(DCTELEM *data);
+EXTERN(void) jpeg_fdct_float(FAST_FLOAT *data);
 
-EXTERN(void) jpeg_idct_islow
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_ifast
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_float
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_7x7
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_6x6
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_5x5
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_4x4
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_3x3
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_2x2
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_1x1
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_9x9
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_10x10
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_11x11
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_12x12
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_13x13
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_14x14
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_15x15
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
-EXTERN(void) jpeg_idct_16x16
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_islow(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_ifast(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_float(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_7x7(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_6x6(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_5x5(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_4x4(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_3x3(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_2x2(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_1x1(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_9x9(j_decompress_ptr cinfo,
+                           jpeg_component_info *compptr, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_10x10(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_11x11(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_12x12(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_13x13(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_14x14(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_15x15(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jpeg_idct_16x16(j_decompress_ptr cinfo,
+                             jpeg_component_info *compptr, JCOEFPTR coef_block,
+                             JSAMPARRAY output_buf, JDIMENSION output_col);
 
 
 /*
@@ -160,22 +160,22 @@ EXTERN(void) jpeg_idct_16x16
  * and may differ from one module to the next.
  */
 
-#define ONE     ((JLONG) 1)
-#define CONST_SCALE (ONE << CONST_BITS)
+#define ONE          ((JLONG)1)
+#define CONST_SCALE  (ONE << CONST_BITS)
 
 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
  * Caution: some C compilers fail to reduce "FIX(constant)" at compile time,
  * thus causing a lot of useless floating-point operations at run time.
  */
 
-#define FIX(x)  ((JLONG) ((x) * CONST_SCALE + 0.5))
+#define FIX(x)  ((JLONG)((x) * CONST_SCALE + 0.5))
 
 /* Descale and correctly round a JLONG value that's scaled by N bits.
  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
  * the fudge factor is correct for either sign of X.
  */
 
-#define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
+#define DESCALE(x, n)  RIGHT_SHIFT((x) + (ONE << ((n) - 1)), n)
 
 /* Multiply a JLONG variable by a JLONG constant to yield a JLONG result.
  * This macro is used only when the two inputs will actually be no more than
@@ -187,22 +187,22 @@ EXTERN(void) jpeg_idct_16x16
  */
 
 #ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
-#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((INT16) (const)))
+#define MULTIPLY16C16(var, const)  (((INT16)(var)) * ((INT16)(const)))
 #endif
 #ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
-#define MULTIPLY16C16(var,const)  (((INT16) (var)) * ((JLONG) (const)))
+#define MULTIPLY16C16(var, const)  (((INT16)(var)) * ((JLONG)(const)))
 #endif
 
 #ifndef MULTIPLY16C16           /* default definition */
-#define MULTIPLY16C16(var,const)  ((var) * (const))
+#define MULTIPLY16C16(var, const)  ((var) * (const))
 #endif
 
 /* Same except both inputs are variables. */
 
 #ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
-#define MULTIPLY16V16(var1,var2)  (((INT16) (var1)) * ((INT16) (var2)))
+#define MULTIPLY16V16(var1, var2)  (((INT16)(var1)) * ((INT16)(var2)))
 #endif
 
 #ifndef MULTIPLY16V16           /* default definition */
-#define MULTIPLY16V16(var1,var2)  ((var1) * (var2))
+#define MULTIPLY16V16(var1, var2)  ((var1) * (var2))
 #endif
diff --git a/media/libjpeg/jddctmgr.c b/media/libjpeg/jddctmgr.c
index 3a5ba7e893..e78d7bebe2 100644
--- a/media/libjpeg/jddctmgr.c
+++ b/media/libjpeg/jddctmgr.c
@@ -6,7 +6,7 @@
  * Modified 2002-2010 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015, D. R. Commander.
+ * Copyright (C) 2010, 2015, 2022, D. R. Commander.
  * Copyright (C) 2013, MIPS Technologies, Inc., California.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -94,9 +94,9 @@ typedef union {
  */
 
 METHODDEF(void)
-start_pass (j_decompress_ptr cinfo)
+start_pass(j_decompress_ptr cinfo)
 {
-  my_idct_ptr idct = (my_idct_ptr) cinfo->idct;
+  my_idct_ptr idct = (my_idct_ptr)cinfo->idct;
   int ci, i;
   jpeg_component_info *compptr;
   int method = 0;
@@ -233,7 +233,7 @@ start_pass (j_decompress_ptr cinfo)
      * multiplier table all-zero; we'll be reading zeroes from the
      * coefficient controller's buffer anyway.
      */
-    if (! compptr->component_needed || idct->cur_method[ci] == method)
+    if (!compptr->component_needed || idct->cur_method[ci] == method)
       continue;
     qtbl = compptr->quant_table;
     if (qtbl == NULL)           /* happens if no data yet for component */
@@ -246,9 +246,9 @@ start_pass (j_decompress_ptr cinfo)
         /* For LL&M IDCT method, multipliers are equal to raw quantization
          * coefficients, but are stored as ints to ensure access efficiency.
          */
-        ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *) compptr->dct_table;
+        ISLOW_MULT_TYPE *ismtbl = (ISLOW_MULT_TYPE *)compptr->dct_table;
         for (i = 0; i < DCTSIZE2; i++) {
-          ismtbl[i] = (ISLOW_MULT_TYPE) qtbl->quantval[i];
+          ismtbl[i] = (ISLOW_MULT_TYPE)qtbl->quantval[i];
         }
       }
       break;
@@ -263,8 +263,8 @@ start_pass (j_decompress_ptr cinfo)
          * For integer operation, the multiplier table is to be scaled by
          * IFAST_SCALE_BITS.
          */
-        IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
-#define CONST_BITS 14
+        IFAST_MULT_TYPE *ifmtbl = (IFAST_MULT_TYPE *)compptr->dct_table;
+#define CONST_BITS  14
         static const INT16 aanscales[DCTSIZE2] = {
           /* precomputed values scaled up by 14 bits */
           16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
@@ -280,9 +280,9 @@ start_pass (j_decompress_ptr cinfo)
 
         for (i = 0; i < DCTSIZE2; i++) {
           ifmtbl[i] = (IFAST_MULT_TYPE)
-            DESCALE(MULTIPLY16V16((JLONG) qtbl->quantval[i],
-                                  (JLONG) aanscales[i]),
-                    CONST_BITS-IFAST_SCALE_BITS);
+            DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
+                                  (JLONG)aanscales[i]),
+                    CONST_BITS - IFAST_SCALE_BITS);
         }
       }
       break;
@@ -295,7 +295,7 @@ start_pass (j_decompress_ptr cinfo)
          *   scalefactor[0] = 1
          *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
          */
-        FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *) compptr->dct_table;
+        FLOAT_MULT_TYPE *fmtbl = (FLOAT_MULT_TYPE *)compptr->dct_table;
         int row, col;
         static const double aanscalefactor[DCTSIZE] = {
           1.0, 1.387039845, 1.306562965, 1.175875602,
@@ -306,7 +306,7 @@ start_pass (j_decompress_ptr cinfo)
         for (row = 0; row < DCTSIZE; row++) {
           for (col = 0; col < DCTSIZE; col++) {
             fmtbl[i] = (FLOAT_MULT_TYPE)
-              ((double) qtbl->quantval[i] *
+              ((double)qtbl->quantval[i] *
                aanscalefactor[row] * aanscalefactor[col]);
             i++;
           }
@@ -327,25 +327,25 @@ start_pass (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_inverse_dct (j_decompress_ptr cinfo)
+jinit_inverse_dct(j_decompress_ptr cinfo)
 {
   my_idct_ptr idct;
   int ci;
   jpeg_component_info *compptr;
 
   idct = (my_idct_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_idct_controller));
-  cinfo->idct = (struct jpeg_inverse_dct *) idct;
+  cinfo->idct = (struct jpeg_inverse_dct *)idct;
   idct->pub.start_pass = start_pass;
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* Allocate and pre-zero a multiplier table for each component */
     compptr->dct_table =
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(multiplier_table));
-    MEMZERO(compptr->dct_table, sizeof(multiplier_table));
+    memset(compptr->dct_table, 0, sizeof(multiplier_table));
     /* Mark multiplier table not yet set up for any method */
     idct->cur_method[ci] = -1;
   }
diff --git a/media/libjpeg/jdhuff.c b/media/libjpeg/jdhuff.c
index fa78e2962a..679d221685 100644
--- a/media/libjpeg/jdhuff.c
+++ b/media/libjpeg/jdhuff.c
@@ -4,7 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -15,6 +16,9 @@
  * up to the start of the current MCU.  To do this, we copy state variables
  * into local working storage, and update them back to the permanent
  * storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
  */
 
 #define JPEG_INTERNALS
@@ -36,24 +40,6 @@ typedef struct {
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-        ((dest).last_dc_val[0] = (src).last_dc_val[0], \
-         (dest).last_dc_val[1] = (src).last_dc_val[1], \
-         (dest).last_dc_val[2] = (src).last_dc_val[2], \
-         (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -88,9 +74,9 @@ typedef huff_entropy_decoder *huff_entropy_ptr;
  */
 
 METHODDEF(void)
-start_pass_huff_decoder (j_decompress_ptr cinfo)
+start_pass_huff_decoder(j_decompress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci, blkn, dctbl, actbl;
   d_derived_tbl **pdtbl;
   jpeg_component_info *compptr;
@@ -99,7 +85,7 @@ start_pass_huff_decoder (j_decompress_ptr cinfo)
    * This ought to be an error condition, but we make it a warning because
    * there are some baseline files out there with all zeroes in these bytes.
    */
-  if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2-1 ||
+  if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
       cinfo->Ah != 0 || cinfo->Al != 0)
     WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
 
@@ -152,8 +138,8 @@ start_pass_huff_decoder (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
-                         d_derived_tbl **pdtbl)
+jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC, int tblno,
+                        d_derived_tbl **pdtbl)
 {
   JHUFF_TBL *htbl;
   d_derived_tbl *dtbl;
@@ -178,7 +164,7 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
   /* Allocate a workspace if we haven't already done so. */
   if (*pdtbl == NULL)
     *pdtbl = (d_derived_tbl *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(d_derived_tbl));
   dtbl = *pdtbl;
   dtbl->pub = htbl;             /* fill in back link */
@@ -187,11 +173,11 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 
   p = 0;
   for (l = 1; l <= 16; l++) {
-    i = (int) htbl->bits[l];
+    i = (int)htbl->bits[l];
     if (i < 0 || p + i > 256)   /* protect against table overrun */
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     while (i--)
-      huffsize[p++] = (char) l;
+      huffsize[p++] = (char)l;
   }
   huffsize[p] = 0;
   numsymbols = p;
@@ -203,14 +189,14 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
   si = huffsize[0];
   p = 0;
   while (huffsize[p]) {
-    while (((int) huffsize[p]) == si) {
+    while (((int)huffsize[p]) == si) {
       huffcode[p++] = code;
       code++;
     }
     /* code is now 1 more than the last code used for codelength si; but
      * it must still fit in si bits, since no code is allowed to be all ones.
      */
-    if (((JLONG) code) >= (((JLONG) 1) << si))
+    if (((JLONG)code) >= (((JLONG)1) << si))
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
     code <<= 1;
     si++;
@@ -224,9 +210,9 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
       /* valoffset[l] = huffval[] index of 1st symbol of code length l,
        * minus the minimum code of length l
        */
-      dtbl->valoffset[l] = (JLONG) p - (JLONG) huffcode[p];
+      dtbl->valoffset[l] = (JLONG)p - (JLONG)huffcode[p];
       p += htbl->bits[l];
-      dtbl->maxcode[l] = huffcode[p-1]; /* maximum code of length l */
+      dtbl->maxcode[l] = huffcode[p - 1]; /* maximum code of length l */
     } else {
       dtbl->maxcode[l] = -1;    /* -1 if no codes of this length */
     }
@@ -241,16 +227,16 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
    * with that code.
    */
 
-   for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
-     dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
+  for (i = 0; i < (1 << HUFF_LOOKAHEAD); i++)
+    dtbl->lookup[i] = (HUFF_LOOKAHEAD + 1) << HUFF_LOOKAHEAD;
 
   p = 0;
   for (l = 1; l <= HUFF_LOOKAHEAD; l++) {
-    for (i = 1; i <= (int) htbl->bits[l]; i++, p++) {
+    for (i = 1; i <= (int)htbl->bits[l]; i++, p++) {
       /* l = current code's length, p = its index in huffcode[] & huffval[]. */
       /* Generate left-justified code followed by all possible bit sequences */
-      lookbits = huffcode[p] << (HUFF_LOOKAHEAD-l);
-      for (ctr = 1 << (HUFF_LOOKAHEAD-l); ctr > 0; ctr--) {
+      lookbits = huffcode[p] << (HUFF_LOOKAHEAD - l);
+      for (ctr = 1 << (HUFF_LOOKAHEAD - l); ctr > 0; ctr--) {
         dtbl->lookup[lookbits] = (l << HUFF_LOOKAHEAD) | htbl->huffval[p];
         lookbits++;
       }
@@ -291,14 +277,14 @@ jpeg_make_d_derived_tbl (j_decompress_ptr cinfo, boolean isDC, int tblno,
 #ifdef SLOW_SHIFT_32
 #define MIN_GET_BITS  15        /* minimum allowable value */
 #else
-#define MIN_GET_BITS  (BIT_BUF_SIZE-7)
+#define MIN_GET_BITS  (BIT_BUF_SIZE - 7)
 #endif
 
 
 GLOBAL(boolean)
-jpeg_fill_bit_buffer (bitread_working_state *state,
-                      register bit_buf_type get_buffer, register int bits_left,
-                      int nbits)
+jpeg_fill_bit_buffer(bitread_working_state *state,
+                     register bit_buf_type get_buffer, register int bits_left,
+                     int nbits)
 /* Load up the bit buffer to a depth of at least nbits */
 {
   /* Copy heavily used state fields into locals (hopefully registers) */
@@ -316,13 +302,13 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
 
       /* Attempt to read a byte */
       if (bytes_in_buffer == 0) {
-        if (! (*cinfo->src->fill_input_buffer) (cinfo))
+        if (!(*cinfo->src->fill_input_buffer) (cinfo))
           return FALSE;
         next_input_byte = cinfo->src->next_input_byte;
         bytes_in_buffer = cinfo->src->bytes_in_buffer;
       }
       bytes_in_buffer--;
-      c = GETJOCTET(*next_input_byte++);
+      c = *next_input_byte++;
 
       /* If it's 0xFF, check and discard stuffed zero byte */
       if (c == 0xFF) {
@@ -333,13 +319,13 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
          */
         do {
           if (bytes_in_buffer == 0) {
-            if (! (*cinfo->src->fill_input_buffer) (cinfo))
+            if (!(*cinfo->src->fill_input_buffer) (cinfo))
               return FALSE;
             next_input_byte = cinfo->src->next_input_byte;
             bytes_in_buffer = cinfo->src->bytes_in_buffer;
           }
           bytes_in_buffer--;
-          c = GETJOCTET(*next_input_byte++);
+          c = *next_input_byte++;
         } while (c == 0xFF);
 
         if (c == 0) {
@@ -365,7 +351,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
       bits_left += 8;
     } /* end while */
   } else {
-  no_more_bytes:
+no_more_bytes:
     /* We get here if we've read the marker that terminates the compressed
      * data segment.  There should be enough bits in the buffer register
      * to satisfy the request; if so, no problem.
@@ -376,7 +362,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
        * We use a nonvolatile flag to ensure that only one warning message
        * appears per data segment.
        */
-      if (! cinfo->entropy->insufficient_data) {
+      if (!cinfo->entropy->insufficient_data) {
         WARNMS(cinfo, JWRN_HIT_MARKER);
         cinfo->entropy->insufficient_data = TRUE;
       }
@@ -400,11 +386,10 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
    handle markers.  We have to hand off any blocks with markers to the
    slower routines. */
 
-#define GET_BYTE \
-{ \
+#define GET_BYTE { \
   register int c0, c1; \
-  c0 = GETJOCTET(*buffer++); \
-  c1 = GETJOCTET(*buffer); \
+  c0 = *buffer++; \
+  c1 = *buffer; \
   /* Pre-execute most common case */ \
   get_buffer = (get_buffer << 8) | c0; \
   bits_left += 8; \
@@ -421,7 +406,7 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
   } \
 }
 
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
 
 /* Pre-fetch 48 bytes, because the holding register is 64-bit */
 #define FILL_BIT_BUFFER_FAST \
@@ -446,9 +431,9 @@ jpeg_fill_bit_buffer (bitread_working_state *state,
  */
 
 GLOBAL(int)
-jpeg_huff_decode (bitread_working_state *state,
-                  register bit_buf_type get_buffer, register int bits_left,
-                  d_derived_tbl *htbl, int min_bits)
+jpeg_huff_decode(bitread_working_state *state,
+                 register bit_buf_type get_buffer, register int bits_left,
+                 d_derived_tbl *htbl, int min_bits)
 {
   register int l = min_bits;
   register JLONG code;
@@ -460,7 +445,7 @@ jpeg_huff_decode (bitread_working_state *state,
   code = GET_BITS(l);
 
   /* Collect the rest of the Huffman code one bit at a time. */
-  /* This is per Figure F.16 in the JPEG spec. */
+  /* This is per Figure F.16. */
 
   while (code > htbl->maxcode[l]) {
     code <<= 1;
@@ -480,7 +465,7 @@ jpeg_huff_decode (bitread_working_state *state,
     return 0;                   /* fake a zero as the safest result */
   }
 
-  return htbl->pub->huffval[ (int) (code + htbl->valoffset[l]) ];
+  return htbl->pub->huffval[(int)(code + htbl->valoffset[l])];
 }
 
 
@@ -492,22 +477,26 @@ jpeg_huff_decode (bitread_working_state *state,
 #define AVOID_TABLES
 #ifdef AVOID_TABLES
 
-#define NEG_1 ((unsigned int)-1)
-#define HUFF_EXTEND(x,s)  ((x) + ((((x) - (1<<((s)-1))) >> 31) & (((NEG_1)<<(s)) + 1)))
+#define NEG_1  ((unsigned int)-1)
+#define HUFF_EXTEND(x, s) \
+  ((x) + ((((x) - (1 << ((s) - 1))) >> 31) & (((NEG_1) << (s)) + 1)))
 
 #else
 
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
 
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = {   /* entry n is 2**(n-1) */
+  0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+  0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
 
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+  0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+  ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+  ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+  ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
 
 #endif /* AVOID_TABLES */
 
@@ -518,9 +507,9 @@ static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
  */
 
 LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int ci;
 
   /* Throw away any unused bits remaining in bit buffer; */
@@ -529,7 +518,7 @@ process_restart (j_decompress_ptr cinfo)
   entropy->bitstate.bits_left = 0;
 
   /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
     return FALSE;
 
   /* Re-initialize DC predictions to 0 */
@@ -551,18 +540,24 @@ process_restart (j_decompress_ptr cinfo)
 }
 
 
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+               no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
 LOCAL(boolean)
-decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   BITREAD_STATE_VARS;
   int blkn;
   savable_state state;
   /* Outer loop handles each block in the MCU */
 
   /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-  ASSIGN_STATE(state, entropy->saved);
+  BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -583,11 +578,19 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->dc_needed[blkn]) {
       /* Convert DC difference to actual value, update last_dc_val */
       int ci = cinfo->MCU_membership[blkn];
+      /* Certain malformed JPEG images produce repeated DC coefficient
+       * differences of 2047 or -2047, which causes state.last_dc_val[ci] to
+       * grow until it overflows or underflows a 32-bit signed integer.  This
+       * behavior is, to the best of our understanding, innocuous, and it is
+       * unclear how to work around it without potentially affecting
+       * performance.  Thus, we (hopefully temporarily) suppress UBSan integer
+       * overflow errors for this function and decode_mcu_fast().
+       */
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       if (block) {
         /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
-        (*block)[0] = (JCOEF) s;
+        (*block)[0] = (JCOEF)s;
       }
     }
 
@@ -610,7 +613,7 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
            * Note: the extra entries in jpeg_natural_order[] will save us
            * if k >= DCTSIZE2, which could happen if the data is corrupted.
            */
-          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+          (*block)[jpeg_natural_order[k]] = (JCOEF)s;
         } else {
           if (r != 15)
             break;
@@ -642,16 +645,22 @@ decode_mcu_slow (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Completed MCU, so update state */
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+  entropy->saved = state;
   return TRUE;
 }
 
 
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+               no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
 LOCAL(boolean)
-decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   BITREAD_STATE_VARS;
   JOCTET *buffer;
   int blkn;
@@ -659,9 +668,9 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Outer loop handles each block in the MCU */
 
   /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-  buffer = (JOCTET *) br_state.next_input_byte;
-  ASSIGN_STATE(state, entropy->saved);
+  BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+  buffer = (JOCTET *)br_state.next_input_byte;
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -669,7 +678,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     d_derived_tbl *actbl = entropy->ac_cur_tbls[blkn];
     register int s, k, r, l;
 
-    HUFF_DECODE_FAST(s, l, dctbl, slow_decode_mcu);
+    HUFF_DECODE_FAST(s, l, dctbl);
     if (s) {
       FILL_BIT_BUFFER_FAST
       r = GET_BITS(s);
@@ -678,16 +687,19 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     if (entropy->dc_needed[blkn]) {
       int ci = cinfo->MCU_membership[blkn];
+      /* Refer to the comment in decode_mcu_slow() regarding the supression of
+       * a UBSan integer overflow error in this line of code.
+       */
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       if (block)
-        (*block)[0] = (JCOEF) s;
+        (*block)[0] = (JCOEF)s;
     }
 
     if (entropy->ac_needed[blkn] && block) {
 
       for (k = 1; k < DCTSIZE2; k++) {
-        HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
+        HUFF_DECODE_FAST(s, l, actbl);
         r = s >> 4;
         s &= 15;
 
@@ -696,7 +708,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
           FILL_BIT_BUFFER_FAST
           r = GET_BITS(s);
           s = HUFF_EXTEND(r, s);
-          (*block)[jpeg_natural_order[k]] = (JCOEF) s;
+          (*block)[jpeg_natural_order[k]] = (JCOEF)s;
         } else {
           if (r != 15) break;
           k += 15;
@@ -706,7 +718,7 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     } else {
 
       for (k = 1; k < DCTSIZE2; k++) {
-        HUFF_DECODE_FAST(s, l, actbl, slow_decode_mcu);
+        HUFF_DECODE_FAST(s, l, actbl);
         r = s >> 4;
         s &= 15;
 
@@ -723,15 +735,14 @@ decode_mcu_fast (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   if (cinfo->unread_marker != 0) {
-slow_decode_mcu:
     cinfo->unread_marker = 0;
     return FALSE;
   }
 
   br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
   br_state.next_input_byte = buffer;
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+  entropy->saved = state;
   return TRUE;
 }
 
@@ -751,43 +762,43 @@ slow_decode_mcu:
  * this module, since we'll just re-assign them on the next call.)
  */
 
-#define BUFSIZE (DCTSIZE2 * 8)
+#define BUFSIZE  (DCTSIZE2 * 8)
 
 METHODDEF(boolean)
-decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+  huff_entropy_ptr entropy = (huff_entropy_ptr)cinfo->entropy;
   int usefast = 1;
 
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
     usefast = 0;
   }
 
-  if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU
-    || cinfo->unread_marker != 0)
+  if (cinfo->src->bytes_in_buffer < BUFSIZE * (size_t)cinfo->blocks_in_MCU ||
+      cinfo->unread_marker != 0)
     usefast = 0;
 
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     if (usefast) {
       if (!decode_mcu_fast(cinfo, MCU_data)) goto use_slow;
-    }
-    else {
-      use_slow:
+    } else {
+use_slow:
       if (!decode_mcu_slow(cinfo, MCU_data)) return FALSE;
     }
 
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -798,7 +809,7 @@ decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 GLOBAL(void)
-jinit_huff_decoder (j_decompress_ptr cinfo)
+jinit_huff_decoder(j_decompress_ptr cinfo)
 {
   huff_entropy_ptr entropy;
   int i;
@@ -807,12 +818,12 @@ jinit_huff_decoder (j_decompress_ptr cinfo)
      are the default tables.  Thus, if the tables are not set by the time
      the Huffman decoder is initialized (usually within the body of
      jpeg_start_decompress()), we set them to default values. */
-  std_huff_tables((j_common_ptr) cinfo);
+  std_huff_tables((j_common_ptr)cinfo);
 
   entropy = (huff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(huff_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
   entropy->pub.start_pass = start_pass_huff_decoder;
   entropy->pub.decode_mcu = decode_mcu;
 
diff --git a/media/libjpeg/jdhuff.h b/media/libjpeg/jdhuff.h
index 3f15d71a81..cfa0b7f558 100644
--- a/media/libjpeg/jdhuff.h
+++ b/media/libjpeg/jdhuff.h
@@ -4,7 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010-2011, 2015-2016, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -43,13 +44,12 @@ typedef struct {
    * if too long.  The next 8 bits of each entry contain the
    * symbol.
    */
-  int lookup[1<<HUFF_LOOKAHEAD];
+  int lookup[1 << HUFF_LOOKAHEAD];
 } d_derived_tbl;
 
 /* Expand a Huffman table definition into the derived format */
-EXTERN(void) jpeg_make_d_derived_tbl
-        (j_decompress_ptr cinfo, boolean isDC, int tblno,
-         d_derived_tbl ** pdtbl);
+EXTERN(void) jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC,
+                                     int tblno, d_derived_tbl **pdtbl);
 
 
 /*
@@ -74,11 +74,16 @@ EXTERN(void) jpeg_make_d_derived_tbl
 #error Cannot determine word size
 #endif
 
-#if SIZEOF_SIZE_T==8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
 
 typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  64                /* size of buffer in bits */
 
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64                 /* size of buffer in bits */
+
 #else
 
 typedef unsigned long bit_buf_type;     /* type of bit-extraction buffer */
@@ -113,23 +118,23 @@ typedef struct {                /* Bitreading working state within an MCU */
 } bitread_working_state;
 
 /* Macros to declare and load/save bitread local variables. */
-#define BITREAD_STATE_VARS  \
-        register bit_buf_type get_buffer;  \
-        register int bits_left;  \
-        bitread_working_state br_state
+#define BITREAD_STATE_VARS \
+  register bit_buf_type get_buffer; \
+  register int bits_left; \
+  bitread_working_state br_state
 
-#define BITREAD_LOAD_STATE(cinfop,permstate)  \
-        br_state.cinfo = cinfop; \
-        br_state.next_input_byte = cinfop->src->next_input_byte; \
-        br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
-        get_buffer = permstate.get_buffer; \
-        bits_left = permstate.bits_left;
+#define BITREAD_LOAD_STATE(cinfop, permstate) \
+  br_state.cinfo = cinfop; \
+  br_state.next_input_byte = cinfop->src->next_input_byte; \
+  br_state.bytes_in_buffer = cinfop->src->bytes_in_buffer; \
+  get_buffer = permstate.get_buffer; \
+  bits_left = permstate.bits_left;
 
-#define BITREAD_SAVE_STATE(cinfop,permstate)  \
-        cinfop->src->next_input_byte = br_state.next_input_byte; \
-        cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
-        permstate.get_buffer = get_buffer; \
-        permstate.bits_left = bits_left
+#define BITREAD_SAVE_STATE(cinfop, permstate) \
+  cinfop->src->next_input_byte = br_state.next_input_byte; \
+  cinfop->src->bytes_in_buffer = br_state.bytes_in_buffer; \
+  permstate.get_buffer = get_buffer; \
+  permstate.bits_left = bits_left
 
 /*
  * These macros provide the in-line portion of bit fetching.
@@ -137,7 +142,7 @@ typedef struct {                /* Bitreading working state within an MCU */
  * before using GET_BITS, PEEK_BITS, or DROP_BITS.
  * The variables get_buffer and bits_left are assumed to be locals,
  * but the state struct might not be (jpeg_huff_decode needs this).
- *      CHECK_BIT_BUFFER(state,n,action);
+ *      CHECK_BIT_BUFFER(state, n, action);
  *              Ensure there are N bits in get_buffer; if suspend, take action.
  *      val = GET_BITS(n);
  *              Fetch next N bits.
@@ -149,25 +154,27 @@ typedef struct {                /* Bitreading working state within an MCU */
  * is evaluated multiple times.
  */
 
-#define CHECK_BIT_BUFFER(state,nbits,action) \
-        { if (bits_left < (nbits)) {  \
-            if (! jpeg_fill_bit_buffer(&(state),get_buffer,bits_left,nbits))  \
-              { action; }  \
-            get_buffer = (state).get_buffer; bits_left = (state).bits_left; } }
+#define CHECK_BIT_BUFFER(state, nbits, action) { \
+  if (bits_left < (nbits)) { \
+    if (!jpeg_fill_bit_buffer(&(state), get_buffer, bits_left, nbits)) \
+      { action; } \
+    get_buffer = (state).get_buffer;  bits_left = (state).bits_left; \
+  } \
+}
 
 #define GET_BITS(nbits) \
-        (((int) (get_buffer >> (bits_left -= (nbits)))) & ((1<<(nbits))-1))
+  (((int)(get_buffer >> (bits_left -= (nbits)))) & ((1 << (nbits)) - 1))
 
 #define PEEK_BITS(nbits) \
-        (((int) (get_buffer >> (bits_left -  (nbits)))) & ((1<<(nbits))-1))
+  (((int)(get_buffer >> (bits_left -  (nbits)))) & ((1 << (nbits)) - 1))
 
 #define DROP_BITS(nbits) \
-        (bits_left -= (nbits))
+  (bits_left -= (nbits))
 
 /* Load up the bit buffer to a depth of at least nbits */
-EXTERN(boolean) jpeg_fill_bit_buffer
-        (bitread_working_state *state, register bit_buf_type get_buffer,
-         register int bits_left, int nbits);
+EXTERN(boolean) jpeg_fill_bit_buffer(bitread_working_state *state,
+                                     register bit_buf_type get_buffer,
+                                     register int bits_left, int nbits);
 
 
 /*
@@ -187,13 +194,14 @@ EXTERN(boolean) jpeg_fill_bit_buffer
  * 3. jpeg_huff_decode returns -1 if forced to suspend.
  */
 
-#define HUFF_DECODE(result,state,htbl,failaction,slowlabel) \
-{ register int nb, look; \
+#define HUFF_DECODE(result, state, htbl, failaction, slowlabel) { \
+  register int nb, look; \
   if (bits_left < HUFF_LOOKAHEAD) { \
-    if (! jpeg_fill_bit_buffer(&state,get_buffer,bits_left, 0)) {failaction;} \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+    if (!jpeg_fill_bit_buffer(&state, get_buffer, bits_left, 0)) \
+      { failaction; } \
+    get_buffer = state.get_buffer;  bits_left = state.bits_left; \
     if (bits_left < HUFF_LOOKAHEAD) { \
-      nb = 1; goto slowlabel; \
+      nb = 1;  goto slowlabel; \
     } \
   } \
   look = PEEK_BITS(HUFF_LOOKAHEAD); \
@@ -202,13 +210,14 @@ EXTERN(boolean) jpeg_fill_bit_buffer
     result = htbl->lookup[look] & ((1 << HUFF_LOOKAHEAD) - 1); \
   } else { \
 slowlabel: \
-    if ((result=jpeg_huff_decode(&state,get_buffer,bits_left,htbl,nb)) < 0) \
-        { failaction; } \
-    get_buffer = state.get_buffer; bits_left = state.bits_left; \
+    if ((result = \
+         jpeg_huff_decode(&state, get_buffer, bits_left, htbl, nb)) < 0) \
+      { failaction; } \
+    get_buffer = state.get_buffer;  bits_left = state.bits_left; \
   } \
 }
 
-#define HUFF_DECODE_FAST(s,nb,htbl,slowlabel) \
+#define HUFF_DECODE_FAST(s, nb, htbl) \
   FILL_BIT_BUFFER_FAST; \
   s = PEEK_BITS(HUFF_LOOKAHEAD); \
   s = htbl->lookup[s]; \
@@ -226,11 +235,13 @@ slowlabel: \
       nb++; \
     } \
     if (nb > 16) \
-      goto slowlabel; \
-    s = htbl->pub->huffval[ (int) (s + htbl->valoffset[nb]) ]; \
+      s = 0; \
+    else \
+      s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
   }
 
 /* Out-of-line case for Huffman code fetching */
-EXTERN(int) jpeg_huff_decode
-        (bitread_working_state *state, register bit_buf_type get_buffer,
-         register int bits_left, d_derived_tbl *htbl, int min_bits);
+EXTERN(int) jpeg_huff_decode(bitread_working_state *state,
+                             register bit_buf_type get_buffer,
+                             register int bits_left, d_derived_tbl *htbl,
+                             int min_bits);
diff --git a/media/libjpeg/jdicc.c b/media/libjpeg/jdicc.c
new file mode 100644
index 0000000000..50aa9a9676
--- /dev/null
+++ b/media/libjpeg/jdicc.c
@@ -0,0 +1,167 @@
+/*
+ * jdicc.c
+ *
+ * Copyright (C) 1997-1998, Thomas G. Lane, Todd Newman.
+ * Copyright (C) 2017, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file provides code to read International Color Consortium (ICC) device
+ * profiles embedded in JFIF JPEG image files.  The ICC has defined a standard
+ * for including such data in JPEG "APP2" markers.  The code given here does
+ * not know anything about the internal structure of the ICC profile data; it
+ * just knows how to get the profile data from a JPEG file while reading it.
+ */
+
+#define JPEG_INTERNALS
+#include "jinclude.h"
+#include "jpeglib.h"
+#include "jerror.h"
+
+
+#define ICC_MARKER  (JPEG_APP0 + 2)     /* JPEG marker code for ICC */
+#define ICC_OVERHEAD_LEN  14            /* size of non-profile data in APP2 */
+
+
+/*
+ * Handy subroutine to test whether a saved marker is an ICC profile marker.
+ */
+
+LOCAL(boolean)
+marker_is_icc(jpeg_saved_marker_ptr marker)
+{
+  return
+    marker->marker == ICC_MARKER &&
+    marker->data_length >= ICC_OVERHEAD_LEN &&
+    /* verify the identifying string */
+    marker->data[0] == 0x49 &&
+    marker->data[1] == 0x43 &&
+    marker->data[2] == 0x43 &&
+    marker->data[3] == 0x5F &&
+    marker->data[4] == 0x50 &&
+    marker->data[5] == 0x52 &&
+    marker->data[6] == 0x4F &&
+    marker->data[7] == 0x46 &&
+    marker->data[8] == 0x49 &&
+    marker->data[9] == 0x4C &&
+    marker->data[10] == 0x45 &&
+    marker->data[11] == 0x0;
+}
+
+
+/*
+ * See if there was an ICC profile in the JPEG file being read; if so,
+ * reassemble and return the profile data.
+ *
+ * TRUE is returned if an ICC profile was found, FALSE if not.  If TRUE is
+ * returned, *icc_data_ptr is set to point to the returned data, and
+ * *icc_data_len is set to its length.
+ *
+ * IMPORTANT: the data at *icc_data_ptr is allocated with malloc() and must be
+ * freed by the caller with free() when the caller no longer needs it.
+ * (Alternatively, we could write this routine to use the IJG library's memory
+ * allocator, so that the data would be freed implicitly when
+ * jpeg_finish_decompress() is called.  But it seems likely that many
+ * applications will prefer to have the data stick around after decompression
+ * finishes.)
+ */
+
+GLOBAL(boolean)
+jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                      unsigned int *icc_data_len)
+{
+  jpeg_saved_marker_ptr marker;
+  int num_markers = 0;
+  int seq_no;
+  JOCTET *icc_data;
+  unsigned int total_length;
+#define MAX_SEQ_NO  255         /* sufficient since marker numbers are bytes */
+  char marker_present[MAX_SEQ_NO + 1];      /* 1 if marker found */
+  unsigned int data_length[MAX_SEQ_NO + 1]; /* size of profile data in marker */
+  unsigned int data_offset[MAX_SEQ_NO + 1]; /* offset for data in marker */
+
+  if (icc_data_ptr == NULL || icc_data_len == NULL)
+    ERREXIT(cinfo, JERR_BUFFER_SIZE);
+  if (cinfo->global_state < DSTATE_READY)
+    ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
+
+  *icc_data_ptr = NULL;         /* avoid confusion if FALSE return */
+  *icc_data_len = 0;
+
+  /* This first pass over the saved markers discovers whether there are
+   * any ICC markers and verifies the consistency of the marker numbering.
+   */
+
+  for (seq_no = 1; seq_no <= MAX_SEQ_NO; seq_no++)
+    marker_present[seq_no] = 0;
+
+  for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (marker_is_icc(marker)) {
+      if (num_markers == 0)
+        num_markers = marker->data[13];
+      else if (num_markers != marker->data[13]) {
+        WARNMS(cinfo, JWRN_BOGUS_ICC);  /* inconsistent num_markers fields */
+        return FALSE;
+      }
+      seq_no = marker->data[12];
+      if (seq_no <= 0 || seq_no > num_markers) {
+        WARNMS(cinfo, JWRN_BOGUS_ICC);  /* bogus sequence number */
+        return FALSE;
+      }
+      if (marker_present[seq_no]) {
+        WARNMS(cinfo, JWRN_BOGUS_ICC);  /* duplicate sequence numbers */
+        return FALSE;
+      }
+      marker_present[seq_no] = 1;
+      data_length[seq_no] = marker->data_length - ICC_OVERHEAD_LEN;
+    }
+  }
+
+  if (num_markers == 0)
+    return FALSE;
+
+  /* Check for missing markers, count total space needed,
+   * compute offset of each marker's part of the data.
+   */
+
+  total_length = 0;
+  for (seq_no = 1; seq_no <= num_markers; seq_no++) {
+    if (marker_present[seq_no] == 0) {
+      WARNMS(cinfo, JWRN_BOGUS_ICC);  /* missing sequence number */
+      return FALSE;
+    }
+    data_offset[seq_no] = total_length;
+    total_length += data_length[seq_no];
+  }
+
+  if (total_length == 0) {
+    WARNMS(cinfo, JWRN_BOGUS_ICC);  /* found only empty markers? */
+    return FALSE;
+  }
+
+  /* Allocate space for assembled data */
+  icc_data = (JOCTET *)malloc(total_length * sizeof(JOCTET));
+  if (icc_data == NULL)
+    ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 11);  /* oops, out of memory */
+
+  /* and fill it in */
+  for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
+    if (marker_is_icc(marker)) {
+      JOCTET FAR *src_ptr;
+      JOCTET *dst_ptr;
+      unsigned int length;
+      seq_no = marker->data[12];
+      dst_ptr = icc_data + data_offset[seq_no];
+      src_ptr = marker->data + ICC_OVERHEAD_LEN;
+      length = data_length[seq_no];
+      while (length--) {
+        *dst_ptr++ = *src_ptr++;
+      }
+    }
+  }
+
+  *icc_data_ptr = icc_data;
+  *icc_data_len = total_length;
+
+  return TRUE;
+}
diff --git a/media/libjpeg/jdinput.c b/media/libjpeg/jdinput.c
index 32a6b424e2..1bc5aff1a7 100644
--- a/media/libjpeg/jdinput.c
+++ b/media/libjpeg/jdinput.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2016, D. R. Commander.
+ * Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -33,7 +33,7 @@ typedef my_input_controller *my_inputctl_ptr;
 
 
 /* Forward declarations */
-METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
+METHODDEF(int) consume_markers(j_decompress_ptr cinfo);
 
 
 /*
@@ -41,16 +41,16 @@ METHODDEF(int) consume_markers (j_decompress_ptr cinfo);
  */
 
 LOCAL(void)
-initial_setup (j_decompress_ptr cinfo)
+initial_setup(j_decompress_ptr cinfo)
 /* Called once, when first SOS marker is reached */
 {
   int ci;
   jpeg_component_info *compptr;
 
   /* Make sure image isn't bigger than I can handle */
-  if ((long) cinfo->image_height > (long) JPEG_MAX_DIMENSION ||
-      (long) cinfo->image_width > (long) JPEG_MAX_DIMENSION)
-    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int) JPEG_MAX_DIMENSION);
+  if ((long)cinfo->image_height > (long)JPEG_MAX_DIMENSION ||
+      (long)cinfo->image_width > (long)JPEG_MAX_DIMENSION)
+    ERREXIT1(cinfo, JERR_IMAGE_TOO_BIG, (unsigned int)JPEG_MAX_DIMENSION);
 
   /* For now, precision must match compiled-in value... */
   if (cinfo->data_precision != BITS_IN_JSAMPLE)
@@ -66,8 +66,10 @@ initial_setup (j_decompress_ptr cinfo)
   cinfo->max_v_samp_factor = 1;
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
-    if (compptr->h_samp_factor<=0 || compptr->h_samp_factor>MAX_SAMP_FACTOR ||
-        compptr->v_samp_factor<=0 || compptr->v_samp_factor>MAX_SAMP_FACTOR)
+    if (compptr->h_samp_factor <= 0 ||
+        compptr->h_samp_factor > MAX_SAMP_FACTOR ||
+        compptr->v_samp_factor <= 0 ||
+        compptr->v_samp_factor > MAX_SAMP_FACTOR)
       ERREXIT(cinfo, JERR_BAD_SAMPLING);
     cinfo->max_h_samp_factor = MAX(cinfo->max_h_samp_factor,
                                    compptr->h_samp_factor);
@@ -75,10 +77,10 @@ initial_setup (j_decompress_ptr cinfo)
                                    compptr->v_samp_factor);
   }
 
-#if JPEG_LIB_VERSION >=80
-    cinfo->block_size = DCTSIZE;
-    cinfo->natural_order = jpeg_natural_order;
-    cinfo->lim_Se = DCTSIZE2-1;
+#if JPEG_LIB_VERSION >= 80
+  cinfo->block_size = DCTSIZE;
+  cinfo->natural_order = jpeg_natural_order;
+  cinfo->lim_Se = DCTSIZE2 - 1;
 #endif
 
   /* We initialize DCT_scaled_size and min_DCT_scaled_size to DCTSIZE.
@@ -101,11 +103,11 @@ initial_setup (j_decompress_ptr cinfo)
 #endif
     /* Size in DCT blocks */
     compptr->width_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     compptr->height_in_blocks = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
     /* Set the first and last MCU columns to decompress from multi-scan images.
      * By default, decompress all of the MCU columns.
      */
@@ -117,11 +119,11 @@ initial_setup (j_decompress_ptr cinfo)
      */
     /* Size in samples */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * (long) compptr->h_samp_factor,
-                    (long) cinfo->max_h_samp_factor);
+      jdiv_round_up((long)cinfo->image_width * (long)compptr->h_samp_factor,
+                    (long)cinfo->max_h_samp_factor);
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * (long) compptr->v_samp_factor,
-                    (long) cinfo->max_v_samp_factor);
+      jdiv_round_up((long)cinfo->image_height * (long)compptr->v_samp_factor,
+                    (long)cinfo->max_v_samp_factor);
     /* Mark component needed, until color conversion says otherwise */
     compptr->component_needed = TRUE;
     /* Mark no quantization table yet saved for component */
@@ -130,8 +132,8 @@ initial_setup (j_decompress_ptr cinfo)
 
   /* Compute number of fully interleaved MCU rows. */
   cinfo->total_iMCU_rows = (JDIMENSION)
-    jdiv_round_up((long) cinfo->image_height,
-                  (long) (cinfo->max_v_samp_factor*DCTSIZE));
+    jdiv_round_up((long)cinfo->image_height,
+                  (long)(cinfo->max_v_samp_factor * DCTSIZE));
 
   /* Decide whether file contains multiple scans */
   if (cinfo->comps_in_scan < cinfo->num_components || cinfo->progressive_mode)
@@ -142,7 +144,7 @@ initial_setup (j_decompress_ptr cinfo)
 
 
 LOCAL(void)
-per_scan_setup (j_decompress_ptr cinfo)
+per_scan_setup(j_decompress_ptr cinfo)
 /* Do computations that are needed before processing a JPEG scan */
 /* cinfo->comps_in_scan and cinfo->cur_comp_info[] were set from SOS marker */
 {
@@ -167,7 +169,7 @@ per_scan_setup (j_decompress_ptr cinfo)
     /* For noninterleaved scans, it is convenient to define last_row_height
      * as the number of block rows present in the last iMCU row.
      */
-    tmp = (int) (compptr->height_in_blocks % compptr->v_samp_factor);
+    tmp = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
     if (tmp == 0) tmp = compptr->v_samp_factor;
     compptr->last_row_height = tmp;
 
@@ -184,11 +186,11 @@ per_scan_setup (j_decompress_ptr cinfo)
 
     /* Overall image size in MCUs */
     cinfo->MCUs_per_row = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width,
-                    (long) (cinfo->max_h_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->image_width,
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     cinfo->MCU_rows_in_scan = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height,
-                    (long) (cinfo->max_v_samp_factor*DCTSIZE));
+      jdiv_round_up((long)cinfo->image_height,
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
 
     cinfo->blocks_in_MCU = 0;
 
@@ -198,12 +200,13 @@ per_scan_setup (j_decompress_ptr cinfo)
       compptr->MCU_width = compptr->h_samp_factor;
       compptr->MCU_height = compptr->v_samp_factor;
       compptr->MCU_blocks = compptr->MCU_width * compptr->MCU_height;
-      compptr->MCU_sample_width = compptr->MCU_width * compptr->_DCT_scaled_size;
+      compptr->MCU_sample_width = compptr->MCU_width *
+                                  compptr->_DCT_scaled_size;
       /* Figure number of non-dummy blocks in last MCU column & row */
-      tmp = (int) (compptr->width_in_blocks % compptr->MCU_width);
+      tmp = (int)(compptr->width_in_blocks % compptr->MCU_width);
       if (tmp == 0) tmp = compptr->MCU_width;
       compptr->last_col_width = tmp;
-      tmp = (int) (compptr->height_in_blocks % compptr->MCU_height);
+      tmp = (int)(compptr->height_in_blocks % compptr->MCU_height);
       if (tmp == 0) tmp = compptr->MCU_height;
       compptr->last_row_height = tmp;
       /* Prepare array describing MCU composition */
@@ -231,17 +234,17 @@ per_scan_setup (j_decompress_ptr cinfo)
  * means that we have to save away the table actually used for each component.
  * We do this by copying the table at the start of the first scan containing
  * the component.
- * The JPEG spec prohibits the encoder from changing the contents of a Q-table
- * slot between scans of a component using that slot.  If the encoder does so
- * anyway, this decoder will simply use the Q-table values that were current
- * at the start of the first scan for the component.
+ * Rec. ITU-T T.81 | ISO/IEC 10918-1 prohibits the encoder from changing the
+ * contents of a Q-table slot between scans of a component using that slot.  If
+ * the encoder does so anyway, this decoder will simply use the Q-table values
+ * that were current at the start of the first scan for the component.
  *
  * The decompressor output side looks only at the saved quant tables,
  * not at the current Q-table slots.
  */
 
 LOCAL(void)
-latch_quant_tables (j_decompress_ptr cinfo)
+latch_quant_tables(j_decompress_ptr cinfo)
 {
   int ci, qtblno;
   jpeg_component_info *compptr;
@@ -259,9 +262,9 @@ latch_quant_tables (j_decompress_ptr cinfo)
       ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
     /* OK, save away the quantization table */
     qtbl = (JQUANT_TBL *)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(JQUANT_TBL));
-    MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
+    memcpy(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
     compptr->quant_table = qtbl;
   }
 }
@@ -275,7 +278,7 @@ latch_quant_tables (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_input_pass (j_decompress_ptr cinfo)
+start_input_pass(j_decompress_ptr cinfo)
 {
   per_scan_setup(cinfo);
   latch_quant_tables(cinfo);
@@ -292,7 +295,7 @@ start_input_pass (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-finish_input_pass (j_decompress_ptr cinfo)
+finish_input_pass(j_decompress_ptr cinfo)
 {
   cinfo->inputctl->consume_input = consume_markers;
 }
@@ -309,9 +312,9 @@ finish_input_pass (j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-consume_markers (j_decompress_ptr cinfo)
+consume_markers(j_decompress_ptr cinfo)
 {
-  my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+  my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
   int val;
 
   if (inputctl->pub.eoi_reached) /* After hitting EOI, read no further */
@@ -329,7 +332,7 @@ consume_markers (j_decompress_ptr cinfo)
        * responsible for enforcing this sequencing.
        */
     } else {                    /* 2nd or later SOS marker */
-      if (! inputctl->pub.has_multiple_scans)
+      if (!inputctl->pub.has_multiple_scans)
         ERREXIT(cinfo, JERR_EOI_EXPECTED); /* Oops, I wasn't expecting this! */
       start_input_pass(cinfo);
     }
@@ -360,16 +363,16 @@ consume_markers (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-reset_input_controller (j_decompress_ptr cinfo)
+reset_input_controller(j_decompress_ptr cinfo)
 {
-  my_inputctl_ptr inputctl = (my_inputctl_ptr) cinfo->inputctl;
+  my_inputctl_ptr inputctl = (my_inputctl_ptr)cinfo->inputctl;
 
   inputctl->pub.consume_input = consume_markers;
   inputctl->pub.has_multiple_scans = FALSE; /* "unknown" would be better */
   inputctl->pub.eoi_reached = FALSE;
   inputctl->inheaders = TRUE;
   /* Reset other modules */
-  (*cinfo->err->reset_error_mgr) ((j_common_ptr) cinfo);
+  (*cinfo->err->reset_error_mgr) ((j_common_ptr)cinfo);
   (*cinfo->marker->reset_marker_reader) (cinfo);
   /* Reset progression state -- would be cleaner if entropy decoder did this */
   cinfo->coef_bits = NULL;
@@ -382,15 +385,15 @@ reset_input_controller (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_input_controller (j_decompress_ptr cinfo)
+jinit_input_controller(j_decompress_ptr cinfo)
 {
   my_inputctl_ptr inputctl;
 
   /* Create subobject in permanent pool */
   inputctl = (my_inputctl_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                 sizeof(my_input_controller));
-  cinfo->inputctl = (struct jpeg_input_controller *) inputctl;
+  cinfo->inputctl = (struct jpeg_input_controller *)inputctl;
   /* Initialize method pointers */
   inputctl->pub.consume_input = consume_markers;
   inputctl->pub.reset_input_controller = reset_input_controller;
diff --git a/media/libjpeg/jdmainct.c b/media/libjpeg/jdmainct.c
index ebb069b0f4..f466b259f0 100644
--- a/media/libjpeg/jdmainct.c
+++ b/media/libjpeg/jdmainct.c
@@ -18,6 +18,7 @@
 
 #include "jinclude.h"
 #include "jdmainct.h"
+#include "jconfigint.h"
 
 
 /*
@@ -112,26 +113,29 @@
 
 
 /* Forward declarations */
-METHODDEF(void) process_data_simple_main
-        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
-METHODDEF(void) process_data_context_main
-        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_simple_main(j_decompress_ptr cinfo,
+                                         JSAMPARRAY output_buf,
+                                         JDIMENSION *out_row_ctr,
+                                         JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_context_main(j_decompress_ptr cinfo,
+                                          JSAMPARRAY output_buf,
+                                          JDIMENSION *out_row_ctr,
+                                          JDIMENSION out_rows_avail);
 #ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) process_data_crank_post
-        (j_decompress_ptr cinfo, JSAMPARRAY output_buf,
-         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail);
+METHODDEF(void) process_data_crank_post(j_decompress_ptr cinfo,
+                                        JSAMPARRAY output_buf,
+                                        JDIMENSION *out_row_ctr,
+                                        JDIMENSION out_rows_avail);
 #endif
 
 
 LOCAL(void)
-alloc_funny_pointers (j_decompress_ptr cinfo)
+alloc_funny_pointers(j_decompress_ptr cinfo)
 /* Allocate space for the funny pointer lists.
  * This is done only once, not once per pass.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
@@ -141,7 +145,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
    * We alloc both arrays with one call to save a few cycles.
    */
   main_ptr->xbuffer[0] = (JSAMPIMAGE)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 cinfo->num_components * 2 * sizeof(JSAMPARRAY));
   main_ptr->xbuffer[1] = main_ptr->xbuffer[0] + cinfo->num_components;
 
@@ -153,7 +157,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
      * We alloc both pointer lists with one call to save a few cycles.
      */
     xbuf = (JSAMPARRAY)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   2 * (rgroup * (M + 4)) * sizeof(JSAMPROW));
     xbuf += rgroup;             /* want one row group at negative offsets */
     main_ptr->xbuffer[0][ci] = xbuf;
@@ -164,7 +168,7 @@ alloc_funny_pointers (j_decompress_ptr cinfo)
 
 
 LOCAL(void)
-make_funny_pointers (j_decompress_ptr cinfo)
+make_funny_pointers(j_decompress_ptr cinfo)
 /* Create the funny pointer lists discussed in the comments above.
  * The actual workspace is already allocated (in main_ptr->buffer),
  * and the space for the pointer lists is allocated too.
@@ -172,7 +176,7 @@ make_funny_pointers (j_decompress_ptr cinfo)
  * This will be repeated at the beginning of each pass.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, i, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
@@ -191,8 +195,8 @@ make_funny_pointers (j_decompress_ptr cinfo)
     }
     /* In the second list, put the last four row groups in swapped order */
     for (i = 0; i < rgroup * 2; i++) {
-      xbuf1[rgroup*(M-2) + i] = buf[rgroup*M + i];
-      xbuf1[rgroup*M + i] = buf[rgroup*(M-2) + i];
+      xbuf1[rgroup * (M - 2) + i] = buf[rgroup * M + i];
+      xbuf1[rgroup * M + i] = buf[rgroup * (M - 2) + i];
     }
     /* The wraparound pointers at top and bottom will be filled later
      * (see set_wraparound_pointers, below).  Initially we want the "above"
@@ -207,13 +211,13 @@ make_funny_pointers (j_decompress_ptr cinfo)
 
 
 LOCAL(void)
-set_bottom_pointers (j_decompress_ptr cinfo)
+set_bottom_pointers(j_decompress_ptr cinfo)
 /* Change the pointer lists to duplicate the last sample row at the bottom
  * of the image.  whichptr indicates which xbuffer holds the final iMCU row.
  * Also sets rowgroups_avail to indicate number of nondummy row groups in row.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, i, rgroup, iMCUheight, rows_left;
   jpeg_component_info *compptr;
   JSAMPARRAY xbuf;
@@ -224,20 +228,20 @@ set_bottom_pointers (j_decompress_ptr cinfo)
     iMCUheight = compptr->v_samp_factor * compptr->_DCT_scaled_size;
     rgroup = iMCUheight / cinfo->_min_DCT_scaled_size;
     /* Count nondummy sample rows remaining for this component */
-    rows_left = (int) (compptr->downsampled_height % (JDIMENSION) iMCUheight);
+    rows_left = (int)(compptr->downsampled_height % (JDIMENSION)iMCUheight);
     if (rows_left == 0) rows_left = iMCUheight;
     /* Count nondummy row groups.  Should get same answer for each component,
      * so we need only do it once.
      */
     if (ci == 0) {
-      main_ptr->rowgroups_avail = (JDIMENSION) ((rows_left-1) / rgroup + 1);
+      main_ptr->rowgroups_avail = (JDIMENSION)((rows_left - 1) / rgroup + 1);
     }
     /* Duplicate the last real sample row rgroup*2 times; this pads out the
      * last partial rowgroup and ensures at least one full rowgroup of context.
      */
     xbuf = main_ptr->xbuffer[main_ptr->whichptr][ci];
     for (i = 0; i < rgroup * 2; i++) {
-      xbuf[rows_left + i] = xbuf[rows_left-1];
+      xbuf[rows_left + i] = xbuf[rows_left - 1];
     }
   }
 }
@@ -248,9 +252,9 @@ set_bottom_pointers (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_main(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
@@ -286,22 +290,21 @@ start_pass_main (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(void)
-process_data_simple_main (j_decompress_ptr cinfo,
-                          JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                          JDIMENSION out_rows_avail)
+process_data_simple_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                         JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   JDIMENSION rowgroups_avail;
 
   /* Read input data if we haven't filled the main buffer yet */
-  if (! main_ptr->buffer_full) {
-    if (! (*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
+  if (!main_ptr->buffer_full) {
+    if (!(*cinfo->coef->decompress_data) (cinfo, main_ptr->buffer))
       return;                   /* suspension forced, can do nothing more */
     main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
   }
 
   /* There are always min_DCT_scaled_size row groups in an iMCU row. */
-  rowgroups_avail = (JDIMENSION) cinfo->_min_DCT_scaled_size;
+  rowgroups_avail = (JDIMENSION)cinfo->_min_DCT_scaled_size;
   /* Note: at the bottom of the image, we may pass extra garbage row groups
    * to the postprocessor.  The postprocessor has to check for bottom
    * of image anyway (at row resolution), so no point in us doing it too.
@@ -326,16 +329,15 @@ process_data_simple_main (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-process_data_context_main (j_decompress_ptr cinfo,
-                           JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                           JDIMENSION out_rows_avail)
+process_data_context_main(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                          JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
 
   /* Read input data if we haven't filled the main buffer yet */
-  if (! main_ptr->buffer_full) {
-    if (! (*cinfo->coef->decompress_data) (cinfo,
-                                           main_ptr->xbuffer[main_ptr->whichptr]))
+  if (!main_ptr->buffer_full) {
+    if (!(*cinfo->coef->decompress_data) (cinfo,
+                                          main_ptr->xbuffer[main_ptr->whichptr]))
       return;                   /* suspension forced, can do nothing more */
     main_ptr->buffer_full = TRUE;       /* OK, we have an iMCU row to work with */
     main_ptr->iMCU_row_ctr++;   /* count rows received */
@@ -349,31 +351,35 @@ process_data_context_main (j_decompress_ptr cinfo,
   switch (main_ptr->context_state) {
   case CTX_POSTPONED_ROW:
     /* Call postprocessor using previously set pointers for postponed row */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-                        &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
-                        output_buf, out_row_ctr, out_rows_avail);
+    (*cinfo->post->post_process_data) (cinfo,
+                                       main_ptr->xbuffer[main_ptr->whichptr],
+                                       &main_ptr->rowgroup_ctr,
+                                       main_ptr->rowgroups_avail, output_buf,
+                                       out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
       return;                   /* Need to suspend */
     main_ptr->context_state = CTX_PREPARE_FOR_IMCU;
     if (*out_row_ctr >= out_rows_avail)
       return;                   /* Postprocessor exactly filled output buf */
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case CTX_PREPARE_FOR_IMCU:
     /* Prepare to process first M-1 row groups of this iMCU row */
     main_ptr->rowgroup_ctr = 0;
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size - 1);
+    main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size - 1);
     /* Check for bottom of image: if so, tweak pointers to "duplicate"
      * the last sample row, and adjust rowgroups_avail to ignore padding rows.
      */
     if (main_ptr->iMCU_row_ctr == cinfo->total_iMCU_rows)
       set_bottom_pointers(cinfo);
     main_ptr->context_state = CTX_PROCESS_IMCU;
-    /*FALLTHROUGH*/
+    FALLTHROUGH                 /*FALLTHROUGH*/
   case CTX_PROCESS_IMCU:
     /* Call postprocessor using previously set pointers */
-    (*cinfo->post->post_process_data) (cinfo, main_ptr->xbuffer[main_ptr->whichptr],
-                        &main_ptr->rowgroup_ctr, main_ptr->rowgroups_avail,
-                        output_buf, out_row_ctr, out_rows_avail);
+    (*cinfo->post->post_process_data) (cinfo,
+                                       main_ptr->xbuffer[main_ptr->whichptr],
+                                       &main_ptr->rowgroup_ctr,
+                                       main_ptr->rowgroups_avail, output_buf,
+                                       out_row_ctr, out_rows_avail);
     if (main_ptr->rowgroup_ctr < main_ptr->rowgroups_avail)
       return;                   /* Need to suspend */
     /* After the first iMCU, change wraparound pointers to normal state */
@@ -384,8 +390,8 @@ process_data_context_main (j_decompress_ptr cinfo,
     main_ptr->buffer_full = FALSE;
     /* Still need to process last row group of this iMCU row, */
     /* which is saved at index M+1 of the other xbuffer */
-    main_ptr->rowgroup_ctr = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 1);
-    main_ptr->rowgroups_avail = (JDIMENSION) (cinfo->_min_DCT_scaled_size + 2);
+    main_ptr->rowgroup_ctr = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 1);
+    main_ptr->rowgroups_avail = (JDIMENSION)(cinfo->_min_DCT_scaled_size + 2);
     main_ptr->context_state = CTX_POSTPONED_ROW;
   }
 }
@@ -400,12 +406,11 @@ process_data_context_main (j_decompress_ptr cinfo,
 #ifdef QUANT_2PASS_SUPPORTED
 
 METHODDEF(void)
-process_data_crank_post (j_decompress_ptr cinfo,
-                         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                         JDIMENSION out_rows_avail)
+process_data_crank_post(j_decompress_ptr cinfo, JSAMPARRAY output_buf,
+                        JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE) NULL,
-                                     (JDIMENSION *) NULL, (JDIMENSION) 0,
+  (*cinfo->post->post_process_data) (cinfo, (JSAMPIMAGE)NULL,
+                                     (JDIMENSION *)NULL, (JDIMENSION)0,
                                      output_buf, out_row_ctr, out_rows_avail);
 }
 
@@ -417,16 +422,16 @@ process_data_crank_post (j_decompress_ptr cinfo,
  */
 
 GLOBAL(void)
-jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_main_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_main_ptr main_ptr;
   int ci, rgroup, ngroups;
   jpeg_component_info *compptr;
 
   main_ptr = (my_main_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_main_controller));
-  cinfo->main = (struct jpeg_d_main_controller *) main_ptr;
+  cinfo->main = (struct jpeg_d_main_controller *)main_ptr;
   main_ptr->pub.start_pass = start_pass_main;
 
   if (need_full_buffer)         /* shouldn't happen */
@@ -449,8 +454,8 @@ jinit_d_main_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
     rgroup = (compptr->v_samp_factor * compptr->_DCT_scaled_size) /
       cinfo->_min_DCT_scaled_size; /* height of a row group of component */
     main_ptr->buffer[ci] = (*cinfo->mem->alloc_sarray)
-                        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+                        ((j_common_ptr)cinfo, JPOOL_IMAGE,
                          compptr->width_in_blocks * compptr->_DCT_scaled_size,
-                         (JDIMENSION) (rgroup * ngroups));
+                         (JDIMENSION)(rgroup * ngroups));
   }
 }
diff --git a/media/libjpeg/jdmainct.h b/media/libjpeg/jdmainct.h
index 30903019ca..37b201ca88 100644
--- a/media/libjpeg/jdmainct.h
+++ b/media/libjpeg/jdmainct.h
@@ -44,12 +44,12 @@ typedef my_main_controller *my_main_ptr;
 
 
 LOCAL(void)
-set_wraparound_pointers (j_decompress_ptr cinfo)
+set_wraparound_pointers(j_decompress_ptr cinfo)
 /* Set up the "wraparound" pointers at top and bottom of the pointer lists.
  * This changes the pointer list state from top-of-image to the normal state.
  */
 {
-  my_main_ptr main_ptr = (my_main_ptr) cinfo->main;
+  my_main_ptr main_ptr = (my_main_ptr)cinfo->main;
   int ci, i, rgroup;
   int M = cinfo->_min_DCT_scaled_size;
   jpeg_component_info *compptr;
@@ -62,10 +62,10 @@ set_wraparound_pointers (j_decompress_ptr cinfo)
     xbuf0 = main_ptr->xbuffer[0][ci];
     xbuf1 = main_ptr->xbuffer[1][ci];
     for (i = 0; i < rgroup; i++) {
-      xbuf0[i - rgroup] = xbuf0[rgroup*(M+1) + i];
-      xbuf1[i - rgroup] = xbuf1[rgroup*(M+1) + i];
-      xbuf0[rgroup*(M+2) + i] = xbuf0[i];
-      xbuf1[rgroup*(M+2) + i] = xbuf1[i];
+      xbuf0[i - rgroup] = xbuf0[rgroup * (M + 1) + i];
+      xbuf1[i - rgroup] = xbuf1[rgroup * (M + 1) + i];
+      xbuf0[rgroup * (M + 2) + i] = xbuf0[i];
+      xbuf1[rgroup * (M + 2) + i] = xbuf1[i];
     }
   }
 }
diff --git a/media/libjpeg/jdmarker.c b/media/libjpeg/jdmarker.c
index e3b612c9b9..f7eba615fd 100644
--- a/media/libjpeg/jdmarker.c
+++ b/media/libjpeg/jdmarker.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2012, 2015, D. R. Commander.
+ * Copyright (C) 2012, 2015, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -119,50 +119,50 @@ typedef my_marker_reader *my_marker_ptr;
  */
 
 /* Declare and initialize local copies of input pointer/count */
-#define INPUT_VARS(cinfo)  \
-        struct jpeg_source_mgr *datasrc = (cinfo)->src;  \
-        const JOCTET *next_input_byte = datasrc->next_input_byte;  \
-        size_t bytes_in_buffer = datasrc->bytes_in_buffer
+#define INPUT_VARS(cinfo) \
+  struct jpeg_source_mgr *datasrc = (cinfo)->src; \
+  const JOCTET *next_input_byte = datasrc->next_input_byte; \
+  size_t bytes_in_buffer = datasrc->bytes_in_buffer
 
 /* Unload the local copies --- do this only at a restart boundary */
-#define INPUT_SYNC(cinfo)  \
-        ( datasrc->next_input_byte = next_input_byte,  \
-          datasrc->bytes_in_buffer = bytes_in_buffer )
+#define INPUT_SYNC(cinfo) \
+  ( datasrc->next_input_byte = next_input_byte, \
+    datasrc->bytes_in_buffer = bytes_in_buffer )
 
 /* Reload the local copies --- used only in MAKE_BYTE_AVAIL */
-#define INPUT_RELOAD(cinfo)  \
-        ( next_input_byte = datasrc->next_input_byte,  \
-          bytes_in_buffer = datasrc->bytes_in_buffer )
+#define INPUT_RELOAD(cinfo) \
+  ( next_input_byte = datasrc->next_input_byte, \
+    bytes_in_buffer = datasrc->bytes_in_buffer )
 
 /* Internal macro for INPUT_BYTE and INPUT_2BYTES: make a byte available.
  * Note we do *not* do INPUT_SYNC before calling fill_input_buffer,
  * but we must reload the local copies after a successful fill.
  */
-#define MAKE_BYTE_AVAIL(cinfo,action)  \
-        if (bytes_in_buffer == 0) {  \
-          if (! (*datasrc->fill_input_buffer) (cinfo))  \
-            { action; }  \
-          INPUT_RELOAD(cinfo);  \
-        }
+#define MAKE_BYTE_AVAIL(cinfo, action) \
+  if (bytes_in_buffer == 0) { \
+    if (!(*datasrc->fill_input_buffer) (cinfo)) \
+      { action; } \
+    INPUT_RELOAD(cinfo); \
+  }
 
 /* Read a byte into variable V.
  * If must suspend, take the specified action (typically "return FALSE").
  */
-#define INPUT_BYTE(cinfo,V,action)  \
-        MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
-                  bytes_in_buffer--; \
-                  V = GETJOCTET(*next_input_byte++); )
+#define INPUT_BYTE(cinfo, V, action) \
+  MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+            bytes_in_buffer--; \
+            V = *next_input_byte++; )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
  * V should be declared unsigned int or perhaps JLONG.
  */
-#define INPUT_2BYTES(cinfo,V,action)  \
-        MAKESTMT( MAKE_BYTE_AVAIL(cinfo,action); \
-                  bytes_in_buffer--; \
-                  V = ((unsigned int) GETJOCTET(*next_input_byte++)) << 8; \
-                  MAKE_BYTE_AVAIL(cinfo,action); \
-                  bytes_in_buffer--; \
-                  V += GETJOCTET(*next_input_byte++); )
+#define INPUT_2BYTES(cinfo, V, action) \
+  MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
+            bytes_in_buffer--; \
+            V = ((unsigned int)(*next_input_byte++)) << 8; \
+            MAKE_BYTE_AVAIL(cinfo, action); \
+            bytes_in_buffer--; \
+            V += *next_input_byte++; )
 
 
 /*
@@ -197,7 +197,7 @@ typedef my_marker_reader *my_marker_ptr;
 
 
 LOCAL(boolean)
-get_soi (j_decompress_ptr cinfo)
+get_soi(j_decompress_ptr cinfo)
 /* Process an SOI marker */
 {
   int i;
@@ -237,7 +237,7 @@ get_soi (j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
+get_sof(j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
 /* Process a SOFn marker */
 {
   JLONG length;
@@ -258,7 +258,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
   length -= 8;
 
   TRACEMS4(cinfo, 1, JTRC_SOF, cinfo->unread_marker,
-           (int) cinfo->image_width, (int) cinfo->image_height,
+           (int)cinfo->image_width, (int)cinfo->image_height,
            cinfo->num_components);
 
   if (cinfo->marker->saw_SOF)
@@ -267,16 +267,16 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
   /* We don't support files in which the image height is initially specified */
   /* as 0 and is later redefined by DNL.  As long as we have to check that,  */
   /* might as well have a general sanity check. */
-  if (cinfo->image_height <= 0 || cinfo->image_width <= 0
-      || cinfo->num_components <= 0)
+  if (cinfo->image_height <= 0 || cinfo->image_width <= 0 ||
+      cinfo->num_components <= 0)
     ERREXIT(cinfo, JERR_EMPTY_IMAGE);
 
   if (length != (cinfo->num_components * 3))
     ERREXIT(cinfo, JERR_BAD_LENGTH);
 
   if (cinfo->comp_info == NULL) /* do only once, even if suspend */
-    cinfo->comp_info = (jpeg_component_info *) (*cinfo->mem->alloc_small)
-                        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    cinfo->comp_info = (jpeg_component_info *)(*cinfo->mem->alloc_small)
+                        ((j_common_ptr)cinfo, JPOOL_IMAGE,
                          cinfo->num_components * sizeof(jpeg_component_info));
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
@@ -301,7 +301,7 @@ get_sof (j_decompress_ptr cinfo, boolean is_prog, boolean is_arith)
 
 
 LOCAL(boolean)
-get_sos (j_decompress_ptr cinfo)
+get_sos(j_decompress_ptr cinfo)
 /* Process a SOS marker */
 {
   JLONG length;
@@ -309,7 +309,7 @@ get_sos (j_decompress_ptr cinfo)
   jpeg_component_info *compptr;
   INPUT_VARS(cinfo);
 
-  if (! cinfo->marker->saw_SOF)
+  if (!cinfo->marker->saw_SOF)
     ERREXIT(cinfo, JERR_SOS_NO_SOF);
 
   INPUT_2BYTES(cinfo, length, return FALSE);
@@ -341,7 +341,7 @@ get_sos (j_decompress_ptr cinfo)
 
     ERREXIT1(cinfo, JERR_BAD_COMPONENT_ID, cc);
 
-  id_found:
+id_found:
 
     cinfo->cur_comp_info[i] = compptr;
     compptr->dc_tbl_no = (c >> 4) & 15;
@@ -384,7 +384,7 @@ get_sos (j_decompress_ptr cinfo)
 #ifdef D_ARITH_CODING_SUPPORTED
 
 LOCAL(boolean)
-get_dac (j_decompress_ptr cinfo)
+get_dac(j_decompress_ptr cinfo)
 /* Process a DAC marker */
 {
   JLONG length;
@@ -402,14 +402,14 @@ get_dac (j_decompress_ptr cinfo)
 
     TRACEMS2(cinfo, 1, JTRC_DAC, index, val);
 
-    if (index < 0 || index >= (2*NUM_ARITH_TBLS))
+    if (index < 0 || index >= (2 * NUM_ARITH_TBLS))
       ERREXIT1(cinfo, JERR_DAC_INDEX, index);
 
     if (index >= NUM_ARITH_TBLS) { /* define AC table */
-      cinfo->arith_ac_K[index-NUM_ARITH_TBLS] = (UINT8) val;
+      cinfo->arith_ac_K[index - NUM_ARITH_TBLS] = (UINT8)val;
     } else {                    /* define DC table */
-      cinfo->arith_dc_L[index] = (UINT8) (val & 0x0F);
-      cinfo->arith_dc_U[index] = (UINT8) (val >> 4);
+      cinfo->arith_dc_L[index] = (UINT8)(val & 0x0F);
+      cinfo->arith_dc_U[index] = (UINT8)(val >> 4);
       if (cinfo->arith_dc_L[index] > cinfo->arith_dc_U[index])
         ERREXIT1(cinfo, JERR_DAC_VALUE, val);
     }
@@ -422,7 +422,7 @@ get_dac (j_decompress_ptr cinfo)
   return TRUE;
 }
 
-#else /* ! D_ARITH_CODING_SUPPORTED */
+#else /* !D_ARITH_CODING_SUPPORTED */
 
 #define get_dac(cinfo)  skip_variable(cinfo)
 
@@ -430,7 +430,7 @@ get_dac (j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-get_dht (j_decompress_ptr cinfo)
+get_dht(j_decompress_ptr cinfo)
 /* Process a DHT marker */
 {
   JLONG length;
@@ -467,13 +467,13 @@ get_dht (j_decompress_ptr cinfo)
     /* Here we just do minimal validation of the counts to avoid walking
      * off the end of our table space.  jdhuff.c will check more carefully.
      */
-    if (count > 256 || ((JLONG) count) > length)
+    if (count > 256 || ((JLONG)count) > length)
       ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
 
     for (i = 0; i < count; i++)
       INPUT_BYTE(cinfo, huffval[i], return FALSE);
 
-    MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8));
+    memset(&huffval[count], 0, (256 - count) * sizeof(UINT8));
 
     length -= count;
 
@@ -489,10 +489,10 @@ get_dht (j_decompress_ptr cinfo)
     }
 
     if (*htblptr == NULL)
-      *htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
+      *htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
 
-    MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
-    MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
+    memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+    memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
   }
 
   if (length != 0)
@@ -504,7 +504,7 @@ get_dht (j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-get_dqt (j_decompress_ptr cinfo)
+get_dqt(j_decompress_ptr cinfo)
 /* Process a DQT marker */
 {
   JLONG length;
@@ -527,7 +527,7 @@ get_dqt (j_decompress_ptr cinfo)
       ERREXIT1(cinfo, JERR_DQT_INDEX, n);
 
     if (cinfo->quant_tbl_ptrs[n] == NULL)
-      cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) cinfo);
+      cinfo->quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr)cinfo);
     quant_ptr = cinfo->quant_tbl_ptrs[n];
 
     for (i = 0; i < DCTSIZE2; i++) {
@@ -536,20 +536,20 @@ get_dqt (j_decompress_ptr cinfo)
       else
         INPUT_BYTE(cinfo, tmp, return FALSE);
       /* We convert the zigzag-order table to natural array order. */
-      quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16) tmp;
+      quant_ptr->quantval[jpeg_natural_order[i]] = (UINT16)tmp;
     }
 
     if (cinfo->err->trace_level >= 2) {
       for (i = 0; i < DCTSIZE2; i += 8) {
         TRACEMS8(cinfo, 2, JTRC_QUANTVALS,
-                 quant_ptr->quantval[i],   quant_ptr->quantval[i+1],
-                 quant_ptr->quantval[i+2], quant_ptr->quantval[i+3],
-                 quant_ptr->quantval[i+4], quant_ptr->quantval[i+5],
-                 quant_ptr->quantval[i+6], quant_ptr->quantval[i+7]);
+                 quant_ptr->quantval[i],     quant_ptr->quantval[i + 1],
+                 quant_ptr->quantval[i + 2], quant_ptr->quantval[i + 3],
+                 quant_ptr->quantval[i + 4], quant_ptr->quantval[i + 5],
+                 quant_ptr->quantval[i + 6], quant_ptr->quantval[i + 7]);
       }
     }
 
-    length -= DCTSIZE2+1;
+    length -= DCTSIZE2 + 1;
     if (prec) length -= DCTSIZE2;
   }
 
@@ -562,7 +562,7 @@ get_dqt (j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-get_dri (j_decompress_ptr cinfo)
+get_dri(j_decompress_ptr cinfo)
 /* Process a DRI marker */
 {
   JLONG length;
@@ -598,28 +598,28 @@ get_dri (j_decompress_ptr cinfo)
 
 
 LOCAL(void)
-examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
-              unsigned int datalen, JLONG remaining)
+examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+             JLONG remaining)
 /* Examine first few bytes from an APP0.
  * Take appropriate action if it is a JFIF marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
  */
 {
-  JLONG totallen = (JLONG) datalen + remaining;
+  JLONG totallen = (JLONG)datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x4A &&
-      GETJOCTET(data[1]) == 0x46 &&
-      GETJOCTET(data[2]) == 0x49 &&
-      GETJOCTET(data[3]) == 0x46 &&
-      GETJOCTET(data[4]) == 0) {
+      data[0] == 0x4A &&
+      data[1] == 0x46 &&
+      data[2] == 0x49 &&
+      data[3] == 0x46 &&
+      data[4] == 0) {
     /* Found JFIF APP0 marker: save info */
     cinfo->saw_JFIF_marker = TRUE;
-    cinfo->JFIF_major_version = GETJOCTET(data[5]);
-    cinfo->JFIF_minor_version = GETJOCTET(data[6]);
-    cinfo->density_unit = GETJOCTET(data[7]);
-    cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]);
-    cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]);
+    cinfo->JFIF_major_version = data[5];
+    cinfo->JFIF_minor_version = data[6];
+    cinfo->density_unit = data[7];
+    cinfo->X_density = (data[8] << 8) + data[9];
+    cinfo->Y_density = (data[10] << 8) + data[11];
     /* Check version.
      * Major version must be 1, anything else signals an incompatible change.
      * (We used to treat this as an error, but now it's a nonfatal warning,
@@ -634,48 +634,45 @@ examine_app0 (j_decompress_ptr cinfo, JOCTET *data,
              cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
              cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
     /* Validate thumbnail dimensions and issue appropriate messages */
-    if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
-      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
-               GETJOCTET(data[12]), GETJOCTET(data[13]));
+    if (data[12] | data[13])
+      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
     totallen -= APP0_DATA_LEN;
-    if (totallen !=
-        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG) 3))
-      TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int) totallen);
+    if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
+      TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
   } else if (datalen >= 6 &&
-      GETJOCTET(data[0]) == 0x4A &&
-      GETJOCTET(data[1]) == 0x46 &&
-      GETJOCTET(data[2]) == 0x58 &&
-      GETJOCTET(data[3]) == 0x58 &&
-      GETJOCTET(data[4]) == 0) {
+             data[0] == 0x4A &&
+             data[1] == 0x46 &&
+             data[2] == 0x58 &&
+             data[3] == 0x58 &&
+             data[4] == 0) {
     /* Found JFIF "JFXX" extension APP0 marker */
     /* The library doesn't actually do anything with these,
      * but we try to produce a helpful trace message.
      */
-    switch (GETJOCTET(data[5])) {
+    switch (data[5]) {
     case 0x10:
-      TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int) totallen);
+      TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
       break;
     case 0x11:
-      TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int) totallen);
+      TRACEMS1(cinfo, 1, JTRC_THUMB_PALETTE, (int)totallen);
       break;
     case 0x13:
-      TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int) totallen);
+      TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
       break;
     default:
-      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
-               GETJOCTET(data[5]), (int) totallen);
+      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
       break;
     }
   } else {
     /* Start of APP0 does not match "JFIF" or "JFXX", or too short */
-    TRACEMS1(cinfo, 1, JTRC_APP0, (int) totallen);
+    TRACEMS1(cinfo, 1, JTRC_APP0, (int)totallen);
   }
 }
 
 
 LOCAL(void)
-examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
-               unsigned int datalen, JLONG remaining)
+examine_app14(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
+              JLONG remaining)
 /* Examine first few bytes from an APP14.
  * Take appropriate action if it is an Adobe marker.
  * datalen is # of bytes at data[], remaining is length of rest of marker data.
@@ -684,28 +681,28 @@ examine_app14 (j_decompress_ptr cinfo, JOCTET *data,
   unsigned int version, flags0, flags1, transform;
 
   if (datalen >= APP14_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x41 &&
-      GETJOCTET(data[1]) == 0x64 &&
-      GETJOCTET(data[2]) == 0x6F &&
-      GETJOCTET(data[3]) == 0x62 &&
-      GETJOCTET(data[4]) == 0x65) {
+      data[0] == 0x41 &&
+      data[1] == 0x64 &&
+      data[2] == 0x6F &&
+      data[3] == 0x62 &&
+      data[4] == 0x65) {
     /* Found Adobe APP14 marker */
-    version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]);
-    flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]);
-    flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]);
-    transform = GETJOCTET(data[11]);
+    version = (data[5] << 8) + data[6];
+    flags0 = (data[7] << 8) + data[8];
+    flags1 = (data[9] << 8) + data[10];
+    transform = data[11];
     TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
     cinfo->saw_Adobe_marker = TRUE;
-    cinfo->Adobe_transform = (UINT8) transform;
+    cinfo->Adobe_transform = (UINT8)transform;
   } else {
     /* Start of APP14 does not match "Adobe", or too short */
-    TRACEMS1(cinfo, 1, JTRC_APP14, (int) (datalen + remaining));
+    TRACEMS1(cinfo, 1, JTRC_APP14, (int)(datalen + remaining));
   }
 }
 
 
 METHODDEF(boolean)
-get_interesting_appn (j_decompress_ptr cinfo)
+get_interesting_appn(j_decompress_ptr cinfo)
 /* Process an APP0 or APP14 marker without saving it */
 {
   JLONG length;
@@ -720,7 +717,7 @@ get_interesting_appn (j_decompress_ptr cinfo)
   if (length >= APPN_DATA_LEN)
     numtoread = APPN_DATA_LEN;
   else if (length > 0)
-    numtoread = (unsigned int) length;
+    numtoread = (unsigned int)length;
   else
     numtoread = 0;
   for (i = 0; i < numtoread; i++)
@@ -730,10 +727,10 @@ get_interesting_appn (j_decompress_ptr cinfo)
   /* process it */
   switch (cinfo->unread_marker) {
   case M_APP0:
-    examine_app0(cinfo, (JOCTET *) b, numtoread, length);
+    examine_app0(cinfo, (JOCTET *)b, numtoread, length);
     break;
   case M_APP14:
-    examine_app14(cinfo, (JOCTET *) b, numtoread, length);
+    examine_app14(cinfo, (JOCTET *)b, numtoread, length);
     break;
   default:
     /* can't get here unless jpeg_save_markers chooses wrong processor */
@@ -744,7 +741,7 @@ get_interesting_appn (j_decompress_ptr cinfo)
   /* skip any remaining data -- could be lots */
   INPUT_SYNC(cinfo);
   if (length > 0)
-    (*cinfo->src->skip_input_data) (cinfo, (long) length);
+    (*cinfo->src->skip_input_data) (cinfo, (long)length);
 
   return TRUE;
 }
@@ -753,10 +750,10 @@ get_interesting_appn (j_decompress_ptr cinfo)
 #ifdef SAVE_MARKERS_SUPPORTED
 
 METHODDEF(boolean)
-save_marker (j_decompress_ptr cinfo)
+save_marker(j_decompress_ptr cinfo)
 /* Save an APPn or COM marker into the marker list */
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
   jpeg_saved_marker_ptr cur_marker = marker->cur_marker;
   unsigned int bytes_read, data_length;
   JOCTET *data;
@@ -770,22 +767,22 @@ save_marker (j_decompress_ptr cinfo)
     if (length >= 0) {          /* watch out for bogus length word */
       /* figure out how much we want to save */
       unsigned int limit;
-      if (cinfo->unread_marker == (int) M_COM)
+      if (cinfo->unread_marker == (int)M_COM)
         limit = marker->length_limit_COM;
       else
-        limit = marker->length_limit_APPn[cinfo->unread_marker - (int) M_APP0];
-      if ((unsigned int) length < limit)
-        limit = (unsigned int) length;
+        limit = marker->length_limit_APPn[cinfo->unread_marker - (int)M_APP0];
+      if ((unsigned int)length < limit)
+        limit = (unsigned int)length;
       /* allocate and initialize the marker item */
       cur_marker = (jpeg_saved_marker_ptr)
-        (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+        (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                     sizeof(struct jpeg_marker_struct) + limit);
       cur_marker->next = NULL;
-      cur_marker->marker = (UINT8) cinfo->unread_marker;
-      cur_marker->original_length = (unsigned int) length;
+      cur_marker->marker = (UINT8)cinfo->unread_marker;
+      cur_marker->original_length = (unsigned int)length;
       cur_marker->data_length = limit;
       /* data area is just beyond the jpeg_marker_struct */
-      data = cur_marker->data = (JOCTET *) (cur_marker + 1);
+      data = cur_marker->data = (JOCTET *)(cur_marker + 1);
       marker->cur_marker = cur_marker;
       marker->bytes_read = 0;
       bytes_read = 0;
@@ -843,14 +840,14 @@ save_marker (j_decompress_ptr cinfo)
     break;
   default:
     TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker,
-             (int) (data_length + length));
+             (int)(data_length + length));
     break;
   }
 
   /* skip any remaining data -- could be lots */
   INPUT_SYNC(cinfo);            /* do before skip_input_data */
   if (length > 0)
-    (*cinfo->src->skip_input_data) (cinfo, (long) length);
+    (*cinfo->src->skip_input_data) (cinfo, (long)length);
 
   return TRUE;
 }
@@ -859,7 +856,7 @@ save_marker (j_decompress_ptr cinfo)
 
 
 METHODDEF(boolean)
-skip_variable (j_decompress_ptr cinfo)
+skip_variable(j_decompress_ptr cinfo)
 /* Skip over an unknown or uninteresting variable-length marker */
 {
   JLONG length;
@@ -868,11 +865,11 @@ skip_variable (j_decompress_ptr cinfo)
   INPUT_2BYTES(cinfo, length, return FALSE);
   length -= 2;
 
-  TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int) length);
+  TRACEMS2(cinfo, 1, JTRC_MISC_MARKER, cinfo->unread_marker, (int)length);
 
   INPUT_SYNC(cinfo);            /* do before skip_input_data */
   if (length > 0)
-    (*cinfo->src->skip_input_data) (cinfo, (long) length);
+    (*cinfo->src->skip_input_data) (cinfo, (long)length);
 
   return TRUE;
 }
@@ -888,7 +885,7 @@ skip_variable (j_decompress_ptr cinfo)
  */
 
 LOCAL(boolean)
-next_marker (j_decompress_ptr cinfo)
+next_marker(j_decompress_ptr cinfo)
 {
   int c;
   INPUT_VARS(cinfo);
@@ -935,7 +932,7 @@ next_marker (j_decompress_ptr cinfo)
 
 
 LOCAL(boolean)
-first_marker (j_decompress_ptr cinfo)
+first_marker(j_decompress_ptr cinfo)
 /* Like next_marker, but used to obtain the initial SOI marker. */
 /* For this marker, we do not allow preceding garbage or fill; otherwise,
  * we might well scan an entire input file before realizing it ain't JPEG.
@@ -948,7 +945,7 @@ first_marker (j_decompress_ptr cinfo)
 
   INPUT_BYTE(cinfo, c, return FALSE);
   INPUT_BYTE(cinfo, c2, return FALSE);
-  if (c != 0xFF || c2 != (int) M_SOI)
+  if (c != 0xFF || c2 != (int)M_SOI)
     ERREXIT2(cinfo, JERR_NO_SOI, c, c2);
 
   cinfo->unread_marker = c2;
@@ -966,18 +963,18 @@ first_marker (j_decompress_ptr cinfo)
  */
 
 METHODDEF(int)
-read_markers (j_decompress_ptr cinfo)
+read_markers(j_decompress_ptr cinfo)
 {
   /* Outer loop repeats once for each marker. */
   for (;;) {
     /* Collect the marker proper, unless we already did. */
     /* NB: first_marker() enforces the requirement that SOI appear first. */
     if (cinfo->unread_marker == 0) {
-      if (! cinfo->marker->saw_SOI) {
-        if (! first_marker(cinfo))
+      if (!cinfo->marker->saw_SOI) {
+        if (!first_marker(cinfo))
           return JPEG_SUSPENDED;
       } else {
-        if (! next_marker(cinfo))
+        if (!next_marker(cinfo))
           return JPEG_SUSPENDED;
       }
     }
@@ -987,28 +984,28 @@ read_markers (j_decompress_ptr cinfo)
      */
     switch (cinfo->unread_marker) {
     case M_SOI:
-      if (! get_soi(cinfo))
+      if (!get_soi(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF0:                /* Baseline */
     case M_SOF1:                /* Extended sequential, Huffman */
-      if (! get_sof(cinfo, FALSE, FALSE))
+      if (!get_sof(cinfo, FALSE, FALSE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF2:                /* Progressive, Huffman */
-      if (! get_sof(cinfo, TRUE, FALSE))
+      if (!get_sof(cinfo, TRUE, FALSE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF9:                /* Extended sequential, arithmetic */
-      if (! get_sof(cinfo, FALSE, TRUE))
+      if (!get_sof(cinfo, FALSE, TRUE))
         return JPEG_SUSPENDED;
       break;
 
     case M_SOF10:               /* Progressive, arithmetic */
-      if (! get_sof(cinfo, TRUE, TRUE))
+      if (!get_sof(cinfo, TRUE, TRUE))
         return JPEG_SUSPENDED;
       break;
 
@@ -1026,7 +1023,7 @@ read_markers (j_decompress_ptr cinfo)
       break;
 
     case M_SOS:
-      if (! get_sos(cinfo))
+      if (!get_sos(cinfo))
         return JPEG_SUSPENDED;
       cinfo->unread_marker = 0; /* processed the marker */
       return JPEG_REACHED_SOS;
@@ -1037,22 +1034,22 @@ read_markers (j_decompress_ptr cinfo)
       return JPEG_REACHED_EOI;
 
     case M_DAC:
-      if (! get_dac(cinfo))
+      if (!get_dac(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_DHT:
-      if (! get_dht(cinfo))
+      if (!get_dht(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_DQT:
-      if (! get_dqt(cinfo))
+      if (!get_dqt(cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_DRI:
-      if (! get_dri(cinfo))
+      if (!get_dri(cinfo))
         return JPEG_SUSPENDED;
       break;
 
@@ -1072,13 +1069,13 @@ read_markers (j_decompress_ptr cinfo)
     case M_APP13:
     case M_APP14:
     case M_APP15:
-      if (! (*((my_marker_ptr) cinfo->marker)->process_APPn[
-                cinfo->unread_marker - (int) M_APP0]) (cinfo))
+      if (!(*((my_marker_ptr)cinfo->marker)->process_APPn[
+               cinfo->unread_marker - (int)M_APP0]) (cinfo))
         return JPEG_SUSPENDED;
       break;
 
     case M_COM:
-      if (! (*((my_marker_ptr) cinfo->marker)->process_COM) (cinfo))
+      if (!(*((my_marker_ptr)cinfo->marker)->process_COM) (cinfo))
         return JPEG_SUSPENDED;
       break;
 
@@ -1095,7 +1092,7 @@ read_markers (j_decompress_ptr cinfo)
       break;
 
     case M_DNL:                 /* Ignore DNL ... perhaps the wrong thing */
-      if (! skip_variable(cinfo))
+      if (!skip_variable(cinfo))
         return JPEG_SUSPENDED;
       break;
 
@@ -1127,25 +1124,25 @@ read_markers (j_decompress_ptr cinfo)
  */
 
 METHODDEF(boolean)
-read_restart_marker (j_decompress_ptr cinfo)
+read_restart_marker(j_decompress_ptr cinfo)
 {
   /* Obtain a marker unless we already did. */
   /* Note that next_marker will complain if it skips any data. */
   if (cinfo->unread_marker == 0) {
-    if (! next_marker(cinfo))
+    if (!next_marker(cinfo))
       return FALSE;
   }
 
   if (cinfo->unread_marker ==
-      ((int) M_RST0 + cinfo->marker->next_restart_num)) {
+      ((int)M_RST0 + cinfo->marker->next_restart_num)) {
     /* Normal case --- swallow the marker and let entropy decoder continue */
     TRACEMS1(cinfo, 3, JTRC_RST, cinfo->marker->next_restart_num);
     cinfo->unread_marker = 0;
   } else {
     /* Uh-oh, the restart markers have been messed up. */
     /* Let the data source manager determine how to resync. */
-    if (! (*cinfo->src->resync_to_restart) (cinfo,
-                                            cinfo->marker->next_restart_num))
+    if (!(*cinfo->src->resync_to_restart) (cinfo,
+                                           cinfo->marker->next_restart_num))
       return FALSE;
   }
 
@@ -1206,7 +1203,7 @@ read_restart_marker (j_decompress_ptr cinfo)
  */
 
 GLOBAL(boolean)
-jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
+jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired)
 {
   int marker = cinfo->unread_marker;
   int action = 1;
@@ -1216,16 +1213,16 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
 
   /* Outer loop handles repeated decision after scanning forward. */
   for (;;) {
-    if (marker < (int) M_SOF0)
+    if (marker < (int)M_SOF0)
       action = 2;               /* invalid marker */
-    else if (marker < (int) M_RST0 || marker > (int) M_RST7)
+    else if (marker < (int)M_RST0 || marker > (int)M_RST7)
       action = 3;               /* valid non-restart marker */
     else {
-      if (marker == ((int) M_RST0 + ((desired+1) & 7)) ||
-          marker == ((int) M_RST0 + ((desired+2) & 7)))
+      if (marker == ((int)M_RST0 + ((desired + 1) & 7)) ||
+          marker == ((int)M_RST0 + ((desired + 2) & 7)))
         action = 3;             /* one of the next two expected restarts */
-      else if (marker == ((int) M_RST0 + ((desired-1) & 7)) ||
-               marker == ((int) M_RST0 + ((desired-2) & 7)))
+      else if (marker == ((int)M_RST0 + ((desired - 1) & 7)) ||
+               marker == ((int)M_RST0 + ((desired - 2) & 7)))
         action = 2;             /* a prior restart, so advance */
       else
         action = 1;             /* desired restart or too far away */
@@ -1238,7 +1235,7 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
       return TRUE;
     case 2:
       /* Scan to the next marker, and repeat the decision loop. */
-      if (! next_marker(cinfo))
+      if (!next_marker(cinfo))
         return FALSE;
       marker = cinfo->unread_marker;
       break;
@@ -1256,9 +1253,9 @@ jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired)
  */
 
 METHODDEF(void)
-reset_marker_reader (j_decompress_ptr cinfo)
+reset_marker_reader(j_decompress_ptr cinfo)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
 
   cinfo->comp_info = NULL;              /* until allocated by get_sof */
   cinfo->input_scan_number = 0;         /* no SOS seen yet */
@@ -1276,16 +1273,16 @@ reset_marker_reader (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_marker_reader (j_decompress_ptr cinfo)
+jinit_marker_reader(j_decompress_ptr cinfo)
 {
   my_marker_ptr marker;
   int i;
 
   /* Create subobject in permanent pool */
   marker = (my_marker_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_PERMANENT,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
                                 sizeof(my_marker_reader));
-  cinfo->marker = (struct jpeg_marker_reader *) marker;
+  cinfo->marker = (struct jpeg_marker_reader *)marker;
   /* Initialize public method pointers */
   marker->pub.reset_marker_reader = reset_marker_reader;
   marker->pub.read_markers = read_markers;
@@ -1314,10 +1311,10 @@ jinit_marker_reader (j_decompress_ptr cinfo)
 #ifdef SAVE_MARKERS_SUPPORTED
 
 GLOBAL(void)
-jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
-                   unsigned int length_limit)
+jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                  unsigned int length_limit)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
   long maxlength;
   jpeg_marker_parser_method processor;
 
@@ -1325,8 +1322,8 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
    * (should only be a concern in a 16-bit environment).
    */
   maxlength = cinfo->mem->max_alloc_chunk - sizeof(struct jpeg_marker_struct);
-  if (((long) length_limit) > maxlength)
-    length_limit = (unsigned int) maxlength;
+  if (((long)length_limit) > maxlength)
+    length_limit = (unsigned int)maxlength;
 
   /* Choose processor routine to use.
    * APP0/APP14 have special requirements.
@@ -1334,23 +1331,23 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
   if (length_limit) {
     processor = save_marker;
     /* If saving APP0/APP14, save at least enough for our internal use. */
-    if (marker_code == (int) M_APP0 && length_limit < APP0_DATA_LEN)
+    if (marker_code == (int)M_APP0 && length_limit < APP0_DATA_LEN)
       length_limit = APP0_DATA_LEN;
-    else if (marker_code == (int) M_APP14 && length_limit < APP14_DATA_LEN)
+    else if (marker_code == (int)M_APP14 && length_limit < APP14_DATA_LEN)
       length_limit = APP14_DATA_LEN;
   } else {
     processor = skip_variable;
     /* If discarding APP0/APP14, use our regular on-the-fly processor. */
-    if (marker_code == (int) M_APP0 || marker_code == (int) M_APP14)
+    if (marker_code == (int)M_APP0 || marker_code == (int)M_APP14)
       processor = get_interesting_appn;
   }
 
-  if (marker_code == (int) M_COM) {
+  if (marker_code == (int)M_COM) {
     marker->process_COM = processor;
     marker->length_limit_COM = length_limit;
-  } else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15) {
-    marker->process_APPn[marker_code - (int) M_APP0] = processor;
-    marker->length_limit_APPn[marker_code - (int) M_APP0] = length_limit;
+  } else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15) {
+    marker->process_APPn[marker_code - (int)M_APP0] = processor;
+    marker->length_limit_APPn[marker_code - (int)M_APP0] = length_limit;
   } else
     ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
 }
@@ -1363,15 +1360,15 @@ jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
  */
 
 GLOBAL(void)
-jpeg_set_marker_processor (j_decompress_ptr cinfo, int marker_code,
-                           jpeg_marker_parser_method routine)
+jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                          jpeg_marker_parser_method routine)
 {
-  my_marker_ptr marker = (my_marker_ptr) cinfo->marker;
+  my_marker_ptr marker = (my_marker_ptr)cinfo->marker;
 
-  if (marker_code == (int) M_COM)
+  if (marker_code == (int)M_COM)
     marker->process_COM = routine;
-  else if (marker_code >= (int) M_APP0 && marker_code <= (int) M_APP15)
-    marker->process_APPn[marker_code - (int) M_APP0] = routine;
+  else if (marker_code >= (int)M_APP0 && marker_code <= (int)M_APP15)
+    marker->process_APPn[marker_code - (int)M_APP0] = routine;
   else
     ERREXIT1(cinfo, JERR_UNKNOWN_MARKER, marker_code);
 }
diff --git a/media/libjpeg/jdmaster.c b/media/libjpeg/jdmaster.c
index 9079dda65c..a3690bf560 100644
--- a/media/libjpeg/jdmaster.c
+++ b/media/libjpeg/jdmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -22,7 +22,6 @@
 #include "jpeglib.h"
 #include "jpegcomp.h"
 #include "jdmaster.h"
-#include "jsimd.h"
 
 
 /*
@@ -31,7 +30,7 @@
  */
 
 LOCAL(boolean)
-use_merged_upsample (j_decompress_ptr cinfo)
+use_merged_upsample(j_decompress_ptr cinfo)
 {
 #ifdef UPSAMPLE_MERGING_SUPPORTED
   /* Merging is the equivalent of plain box-filter upsampling */
@@ -40,22 +39,22 @@ use_merged_upsample (j_decompress_ptr cinfo)
   /* jdmerge.c only supports YCC=>RGB and YCC=>RGB565 color conversion */
   if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
       (cinfo->out_color_space != JCS_RGB &&
-      cinfo->out_color_space != JCS_RGB565 &&
-      cinfo->out_color_space != JCS_EXT_RGB &&
-      cinfo->out_color_space != JCS_EXT_RGBX &&
-      cinfo->out_color_space != JCS_EXT_BGR &&
-      cinfo->out_color_space != JCS_EXT_BGRX &&
-      cinfo->out_color_space != JCS_EXT_XBGR &&
-      cinfo->out_color_space != JCS_EXT_XRGB &&
-      cinfo->out_color_space != JCS_EXT_RGBA &&
-      cinfo->out_color_space != JCS_EXT_BGRA &&
-      cinfo->out_color_space != JCS_EXT_ABGR &&
-      cinfo->out_color_space != JCS_EXT_ARGB))
+       cinfo->out_color_space != JCS_RGB565 &&
+       cinfo->out_color_space != JCS_EXT_RGB &&
+       cinfo->out_color_space != JCS_EXT_RGBX &&
+       cinfo->out_color_space != JCS_EXT_BGR &&
+       cinfo->out_color_space != JCS_EXT_BGRX &&
+       cinfo->out_color_space != JCS_EXT_XBGR &&
+       cinfo->out_color_space != JCS_EXT_XRGB &&
+       cinfo->out_color_space != JCS_EXT_RGBA &&
+       cinfo->out_color_space != JCS_EXT_BGRA &&
+       cinfo->out_color_space != JCS_EXT_ABGR &&
+       cinfo->out_color_space != JCS_EXT_ARGB))
     return FALSE;
   if ((cinfo->out_color_space == JCS_RGB565 &&
-      cinfo->out_color_components != 3) ||
+       cinfo->out_color_components != 3) ||
       (cinfo->out_color_space != JCS_RGB565 &&
-      cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
+       cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space]))
     return FALSE;
   /* and it only handles 2h1v or 2h2v sampling ratios */
   if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -70,17 +69,6 @@ use_merged_upsample (j_decompress_ptr cinfo)
       cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
       cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
     return FALSE;
-#ifdef WITH_SIMD
-  /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
-     isn't, then disabling merged upsampling is likely to be faster when
-     decompressing YCbCr JPEG images. */
-  if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
-      jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
-      (cinfo->out_color_space == JCS_RGB ||
-       (cinfo->out_color_space >= JCS_EXT_RGB &&
-        cinfo->out_color_space <= JCS_EXT_ARGB)))
-    return FALSE;
-#endif
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;                  /* by golly, it'll work... */
 #else
@@ -100,7 +88,7 @@ GLOBAL(void)
 #else
 LOCAL(void)
 #endif
-jpeg_core_output_dimensions (j_decompress_ptr cinfo)
+jpeg_core_output_dimensions(j_decompress_ptr cinfo)
 /* Do computations that are needed before master selection phase.
  * This function is used for transcoding and full decompression.
  */
@@ -113,129 +101,129 @@ jpeg_core_output_dimensions (j_decompress_ptr cinfo)
   if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom) {
     /* Provide 1/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 1;
     cinfo->_min_DCT_v_scaled_size = 1;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 2) {
     /* Provide 2/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 2L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 2L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 2L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 2L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 2;
     cinfo->_min_DCT_v_scaled_size = 2;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 3) {
     /* Provide 3/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 3L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 3L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 3L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 3L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 3;
     cinfo->_min_DCT_v_scaled_size = 3;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 4) {
     /* Provide 4/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 4L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 4L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 4L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 4L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 4;
     cinfo->_min_DCT_v_scaled_size = 4;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 5) {
     /* Provide 5/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 5L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 5L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 5L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 5L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 5;
     cinfo->_min_DCT_v_scaled_size = 5;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 6) {
     /* Provide 6/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 6L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 6L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 6L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 6L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 6;
     cinfo->_min_DCT_v_scaled_size = 6;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 7) {
     /* Provide 7/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 7L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 7L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 7L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 7L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 7;
     cinfo->_min_DCT_v_scaled_size = 7;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 8) {
     /* Provide 8/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 8L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 8L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 8L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 8L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 8;
     cinfo->_min_DCT_v_scaled_size = 8;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 9) {
     /* Provide 9/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 9L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 9L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 9L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 9L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 9;
     cinfo->_min_DCT_v_scaled_size = 9;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 10) {
     /* Provide 10/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 10L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 10L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 10L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 10L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 10;
     cinfo->_min_DCT_v_scaled_size = 10;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 11) {
     /* Provide 11/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 11L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 11L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 11L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 11L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 11;
     cinfo->_min_DCT_v_scaled_size = 11;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 12) {
     /* Provide 12/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 12L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 12L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 12L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 12L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 12;
     cinfo->_min_DCT_v_scaled_size = 12;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 13) {
     /* Provide 13/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 13L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 13L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 13L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 13L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 13;
     cinfo->_min_DCT_v_scaled_size = 13;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 14) {
     /* Provide 14/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 14L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 14L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 14L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 14L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 14;
     cinfo->_min_DCT_v_scaled_size = 14;
   } else if (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * 15) {
     /* Provide 15/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 15L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 15L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 15L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 15L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 15;
     cinfo->_min_DCT_v_scaled_size = 15;
   } else {
     /* Provide 16/block_size scaling */
     cinfo->output_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width * 16L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_width * 16L, (long)DCTSIZE);
     cinfo->output_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height * 16L, (long) DCTSIZE);
+      jdiv_round_up((long)cinfo->image_height * 16L, (long)DCTSIZE);
     cinfo->_min_DCT_h_scaled_size = 16;
     cinfo->_min_DCT_v_scaled_size = 16;
   }
@@ -268,7 +256,7 @@ jpeg_core_output_dimensions (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
+jpeg_calc_output_dimensions(j_decompress_ptr cinfo)
 /* Do computations that are needed before master selection phase */
 {
 #ifdef IDCT_SCALING_SUPPORTED
@@ -314,13 +302,13 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
        ci++, compptr++) {
     /* Size in samples, after IDCT scaling */
     compptr->downsampled_width = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_width *
-                    (long) (compptr->h_samp_factor * compptr->_DCT_scaled_size),
-                    (long) (cinfo->max_h_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_width *
+                    (long)(compptr->h_samp_factor * compptr->_DCT_scaled_size),
+                    (long)(cinfo->max_h_samp_factor * DCTSIZE));
     compptr->downsampled_height = (JDIMENSION)
-      jdiv_round_up((long) cinfo->image_height *
-                    (long) (compptr->v_samp_factor * compptr->_DCT_scaled_size),
-                    (long) (cinfo->max_v_samp_factor * DCTSIZE));
+      jdiv_round_up((long)cinfo->image_height *
+                    (long)(compptr->v_samp_factor * compptr->_DCT_scaled_size),
+                    (long)(cinfo->max_v_samp_factor * DCTSIZE));
   }
 
 #else /* !IDCT_SCALING_SUPPORTED */
@@ -417,31 +405,31 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
  */
 
 LOCAL(void)
-prepare_range_limit_table (j_decompress_ptr cinfo)
+prepare_range_limit_table(j_decompress_ptr cinfo)
 /* Allocate and fill in the sample_range_limit table */
 {
   JSAMPLE *table;
   int i;
 
   table = (JSAMPLE *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                (5 * (MAXJSAMPLE+1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
-  table += (MAXJSAMPLE+1);      /* allow negative subscripts of simple table */
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                (5 * (MAXJSAMPLE + 1) + CENTERJSAMPLE) * sizeof(JSAMPLE));
+  table += (MAXJSAMPLE + 1);    /* allow negative subscripts of simple table */
   cinfo->sample_range_limit = table;
   /* First segment of "simple" table: limit[x] = 0 for x < 0 */
-  MEMZERO(table - (MAXJSAMPLE+1), (MAXJSAMPLE+1) * sizeof(JSAMPLE));
+  memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
   /* Main part of "simple" table: limit[x] = x */
   for (i = 0; i <= MAXJSAMPLE; i++)
-    table[i] = (JSAMPLE) i;
+    table[i] = (JSAMPLE)i;
   table += CENTERJSAMPLE;       /* Point to where post-IDCT table starts */
   /* End of simple table, rest of first half of post-IDCT table */
-  for (i = CENTERJSAMPLE; i < 2*(MAXJSAMPLE+1); i++)
+  for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
     table[i] = MAXJSAMPLE;
   /* Second half of post-IDCT table */
-  MEMZERO(table + (2 * (MAXJSAMPLE+1)),
-          (2 * (MAXJSAMPLE+1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
-  MEMCOPY(table + (4 * (MAXJSAMPLE+1) - CENTERJSAMPLE),
-          cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
+  memset(table + (2 * (MAXJSAMPLE + 1)), 0,
+         (2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
+  memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
+         cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
 }
 
 
@@ -457,9 +445,9 @@ prepare_range_limit_table (j_decompress_ptr cinfo)
  */
 
 LOCAL(void)
-master_selection (j_decompress_ptr cinfo)
+master_selection(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
   boolean use_c_buffer;
   long samplesperrow;
   JDIMENSION jd_samplesperrow;
@@ -469,9 +457,10 @@ master_selection (j_decompress_ptr cinfo)
   prepare_range_limit_table(cinfo);
 
   /* Width of an output scanline must be representable as JDIMENSION. */
-  samplesperrow = (long) cinfo->output_width * (long) cinfo->out_color_components;
-  jd_samplesperrow = (JDIMENSION) samplesperrow;
-  if ((long) jd_samplesperrow != samplesperrow)
+  samplesperrow = (long)cinfo->output_width *
+                  (long)cinfo->out_color_components;
+  jd_samplesperrow = (JDIMENSION)samplesperrow;
+  if ((long)jd_samplesperrow != samplesperrow)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
 
   /* Initialize my private state */
@@ -482,7 +471,7 @@ master_selection (j_decompress_ptr cinfo)
   master->quantizer_1pass = NULL;
   master->quantizer_2pass = NULL;
   /* No mode changes if not using buffered-image mode. */
-  if (! cinfo->quantize_colors || ! cinfo->buffered_image) {
+  if (!cinfo->quantize_colors || !cinfo->buffered_image) {
     cinfo->enable_1pass_quant = FALSE;
     cinfo->enable_external_quant = FALSE;
     cinfo->enable_2pass_quant = FALSE;
@@ -528,7 +517,7 @@ master_selection (j_decompress_ptr cinfo)
   }
 
   /* Post-processing: in particular, color conversion first */
-  if (! cinfo->raw_data_out) {
+  if (!cinfo->raw_data_out) {
     if (master->using_merged_upsample) {
 #ifdef UPSAMPLE_MERGING_SUPPORTED
       jinit_merged_upsampler(cinfo); /* does color conversion too */
@@ -565,11 +554,11 @@ master_selection (j_decompress_ptr cinfo)
   use_c_buffer = cinfo->inputctl->has_multiple_scans || cinfo->buffered_image;
   jinit_d_coef_controller(cinfo, use_c_buffer);
 
-  if (! cinfo->raw_data_out)
+  if (!cinfo->raw_data_out)
     jinit_d_main_controller(cinfo, FALSE /* never need full buffer here */);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Initialize input side of decompressor to consume first scan. */
   (*cinfo->inputctl->start_input_pass) (cinfo);
@@ -579,13 +568,14 @@ master_selection (j_decompress_ptr cinfo)
    */
   cinfo->master->first_iMCU_col = 0;
   cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+  cinfo->master->last_good_iMCU_row = 0;
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* If jpeg_start_decompress will read the whole file, initialize
    * progress monitoring appropriately.  The input step is counted
    * as one pass.
    */
-  if (cinfo->progress != NULL && ! cinfo->buffered_image &&
+  if (cinfo->progress != NULL && !cinfo->buffered_image &&
       cinfo->inputctl->has_multiple_scans) {
     int nscans;
     /* Estimate number of scans to set pass_limit. */
@@ -597,7 +587,7 @@ master_selection (j_decompress_ptr cinfo)
       nscans = cinfo->num_components;
     }
     cinfo->progress->pass_counter = 0L;
-    cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+    cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
     cinfo->progress->completed_passes = 0;
     cinfo->progress->total_passes = (cinfo->enable_2pass_quant ? 3 : 2);
     /* Count the input pass as done */
@@ -617,9 +607,9 @@ master_selection (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-prepare_for_output_pass (j_decompress_ptr cinfo)
+prepare_for_output_pass(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   if (master->pub.is_dummy_pass) {
 #ifdef QUANT_2PASS_SUPPORTED
@@ -645,8 +635,8 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
     }
     (*cinfo->idct->start_pass) (cinfo);
     (*cinfo->coef->start_output_pass) (cinfo);
-    if (! cinfo->raw_data_out) {
-      if (! master->using_merged_upsample)
+    if (!cinfo->raw_data_out) {
+      if (!master->using_merged_upsample)
         (*cinfo->cconvert->start_pass) (cinfo);
       (*cinfo->upsample->start_pass) (cinfo);
       if (cinfo->quantize_colors)
@@ -665,7 +655,7 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
     /* In buffered-image mode, we assume one more output pass if EOI not
      * yet reached, but no more passes if EOI has been reached.
      */
-    if (cinfo->buffered_image && ! cinfo->inputctl->eoi_reached) {
+    if (cinfo->buffered_image && !cinfo->inputctl->eoi_reached) {
       cinfo->progress->total_passes += (cinfo->enable_2pass_quant ? 2 : 1);
     }
   }
@@ -677,9 +667,9 @@ prepare_for_output_pass (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-finish_output_pass (j_decompress_ptr cinfo)
+finish_output_pass(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   if (cinfo->quantize_colors)
     (*cinfo->cquantize->finish_pass) (cinfo);
@@ -694,9 +684,9 @@ finish_output_pass (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jpeg_new_colormap (j_decompress_ptr cinfo)
+jpeg_new_colormap(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   /* Prevent application from calling me at wrong times */
   if (cinfo->global_state != DSTATE_BUFIMAGE)
@@ -722,9 +712,9 @@ jpeg_new_colormap (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_master_decompress (j_decompress_ptr cinfo)
+jinit_master_decompress(j_decompress_ptr cinfo)
 {
-  my_master_ptr master = (my_master_ptr) cinfo->master;
+  my_master_ptr master = (my_master_ptr)cinfo->master;
 
   master->pub.prepare_for_output_pass = prepare_for_output_pass;
   master->pub.finish_output_pass = finish_output_pass;
diff --git a/media/libjpeg/jdmerge.c b/media/libjpeg/jdmerge.c
index 6276dd0950..3a456d6581 100644
--- a/media/libjpeg/jdmerge.c
+++ b/media/libjpeg/jdmerge.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2020, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -40,44 +40,16 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jdmerge.h"
 #include "jsimd.h"
 #include "jconfigint.h"
 
 #ifdef UPSAMPLE_MERGING_SUPPORTED
 
 
-/* Private subobject */
-
-typedef struct {
-  struct jpeg_upsampler pub;    /* public fields */
-
-  /* Pointer to routine to do actual upsampling/conversion of one row group */
-  void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-                    JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-
-  /* Private state for YCC->RGB conversion */
-  int *Cr_r_tab;                /* => table for Cr to R conversion */
-  int *Cb_b_tab;                /* => table for Cb to B conversion */
-  JLONG *Cr_g_tab;              /* => table for Cr to G conversion */
-  JLONG *Cb_g_tab;              /* => table for Cb to G conversion */
-
-  /* For 2:1 vertical sampling, we produce two output rows at a time.
-   * We need a "spare" row buffer to hold the second output row if the
-   * application provides just a one-row buffer; we also use the spare
-   * to discard the dummy last row if the image height is odd.
-   */
-  JSAMPROW spare_row;
-  boolean spare_full;           /* T if spare buffer is occupied */
-
-  JDIMENSION out_row_width;     /* samples per output row */
-  JDIMENSION rows_to_go;        /* counts rows remaining in image */
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
-
 #define SCALEBITS       16      /* speediest right-shift on some machines */
-#define ONE_HALF        ((JLONG) 1 << (SCALEBITS-1))
-#define FIX(x)          ((JLONG) ((x) * (1L<<SCALEBITS) + 0.5))
+#define ONE_HALF        ((JLONG)1 << (SCALEBITS - 1))
+#define FIX(x)          ((JLONG)((x) * (1L << SCALEBITS) + 0.5))
 
 
 /* Include inline routines for colorspace extensions */
@@ -88,12 +60,12 @@ typedef my_upsampler *my_upsample_ptr;
 #undef RGB_BLUE
 #undef RGB_PIXELSIZE
 
-#define RGB_RED EXT_RGB_RED
-#define RGB_GREEN EXT_RGB_GREEN
-#define RGB_BLUE EXT_RGB_BLUE
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define h2v1_merged_upsample_internal extrgb_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extrgb_h2v2_merged_upsample_internal
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define h2v1_merged_upsample_internal  extrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal  extrgb_h2v2_merged_upsample_internal
 #include "jdmrgext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -102,13 +74,13 @@ typedef my_upsampler *my_upsample_ptr;
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
 
-#define RGB_RED EXT_RGBX_RED
-#define RGB_GREEN EXT_RGBX_GREEN
-#define RGB_BLUE EXT_RGBX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define h2v1_merged_upsample_internal extrgbx_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extrgbx_h2v2_merged_upsample_internal
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define h2v1_merged_upsample_internal  extrgbx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal  extrgbx_h2v2_merged_upsample_internal
 #include "jdmrgext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -118,12 +90,12 @@ typedef my_upsampler *my_upsample_ptr;
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
 
-#define RGB_RED EXT_BGR_RED
-#define RGB_GREEN EXT_BGR_GREEN
-#define RGB_BLUE EXT_BGR_BLUE
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define h2v1_merged_upsample_internal extbgr_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extbgr_h2v2_merged_upsample_internal
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define h2v1_merged_upsample_internal  extbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal  extbgr_h2v2_merged_upsample_internal
 #include "jdmrgext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -132,13 +104,13 @@ typedef my_upsampler *my_upsample_ptr;
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
 
-#define RGB_RED EXT_BGRX_RED
-#define RGB_GREEN EXT_BGRX_GREEN
-#define RGB_BLUE EXT_BGRX_BLUE
-#define RGB_ALPHA 3
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define h2v1_merged_upsample_internal extbgrx_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extbgrx_h2v2_merged_upsample_internal
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define h2v1_merged_upsample_internal  extbgrx_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal  extbgrx_h2v2_merged_upsample_internal
 #include "jdmrgext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -148,13 +120,13 @@ typedef my_upsampler *my_upsample_ptr;
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
 
-#define RGB_RED EXT_XBGR_RED
-#define RGB_GREEN EXT_XBGR_GREEN
-#define RGB_BLUE EXT_XBGR_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define h2v1_merged_upsample_internal extxbgr_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extxbgr_h2v2_merged_upsample_internal
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define h2v1_merged_upsample_internal  extxbgr_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal  extxbgr_h2v2_merged_upsample_internal
 #include "jdmrgext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -164,13 +136,13 @@ typedef my_upsampler *my_upsample_ptr;
 #undef h2v1_merged_upsample_internal
 #undef h2v2_merged_upsample_internal
 
-#define RGB_RED EXT_XRGB_RED
-#define RGB_GREEN EXT_XRGB_GREEN
-#define RGB_BLUE EXT_XRGB_BLUE
-#define RGB_ALPHA 0
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define h2v1_merged_upsample_internal extxrgb_h2v1_merged_upsample_internal
-#define h2v2_merged_upsample_internal extxrgb_h2v2_merged_upsample_internal
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define h2v1_merged_upsample_internal  extxrgb_h2v1_merged_upsample_internal
+#define h2v2_merged_upsample_internal  extxrgb_h2v2_merged_upsample_internal
 #include "jdmrgext.c"
 #undef RGB_RED
 #undef RGB_GREEN
@@ -187,25 +159,25 @@ typedef my_upsampler *my_upsample_ptr;
  */
 
 LOCAL(void)
-build_ycc_rgb_table (j_decompress_ptr cinfo)
+build_ycc_rgb_table(j_decompress_ptr cinfo)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   int i;
   JLONG x;
   SHIFT_TEMPS
 
   upsample->Cr_r_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   upsample->Cb_b_tab = (int *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(int));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(int));
   upsample->Cr_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
   upsample->Cb_g_tab = (JLONG *)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                (MAXJSAMPLE+1) * sizeof(JLONG));
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                (MAXJSAMPLE + 1) * sizeof(JLONG));
 
   for (i = 0, x = -CENTERJSAMPLE; i <= MAXJSAMPLE; i++, x++) {
     /* i is the actual input pixel value, in the range 0..MAXJSAMPLE */
@@ -217,10 +189,10 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
     upsample->Cb_b_tab[i] = (int)
                     RIGHT_SHIFT(FIX(1.77200) * x + ONE_HALF, SCALEBITS);
     /* Cr=>G value is scaled-up -0.71414 * x */
-    upsample->Cr_g_tab[i] = (- FIX(0.71414)) * x;
+    upsample->Cr_g_tab[i] = (-FIX(0.71414)) * x;
     /* Cb=>G value is scaled-up -0.34414 * x */
     /* We also add in ONE_HALF so that need not do it in inner loop */
-    upsample->Cb_g_tab[i] = (- FIX(0.34414)) * x + ONE_HALF;
+    upsample->Cb_g_tab[i] = (-FIX(0.34414)) * x + ONE_HALF;
   }
 }
 
@@ -230,9 +202,9 @@ build_ycc_rgb_table (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_pass_merged_upsample (j_decompress_ptr cinfo)
+start_pass_merged_upsample(j_decompress_ptr cinfo)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
 
   /* Mark the spare buffer empty */
   upsample->spare_full = FALSE;
@@ -248,14 +220,13 @@ start_pass_merged_upsample (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-merged_2v_upsample (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+merged_2v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 /* 2:1 vertical sampling case: may need a spare row. */
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   JSAMPROW work_ptrs[2];
   JDIMENSION num_rows;          /* number of rows returned to caller */
 
@@ -264,8 +235,8 @@ merged_2v_upsample (j_decompress_ptr cinfo,
     JDIMENSION size = upsample->out_row_width;
     if (cinfo->out_color_space == JCS_RGB565)
       size = cinfo->output_width * 2;
-    jcopy_sample_rows(& upsample->spare_row, 0, output_buf + *out_row_ctr, 0,
-                      1, size);
+    jcopy_sample_rows(&upsample->spare_row, 0, output_buf + *out_row_ctr, 0, 1,
+                      size);
     num_rows = 1;
     upsample->spare_full = FALSE;
   } else {
@@ -294,20 +265,19 @@ merged_2v_upsample (j_decompress_ptr cinfo,
   *out_row_ctr += num_rows;
   upsample->rows_to_go -= num_rows;
   /* When the buffer is emptied, declare this input row group consumed */
-  if (! upsample->spare_full)
+  if (!upsample->spare_full)
     (*in_row_group_ctr)++;
 }
 
 
 METHODDEF(void)
-merged_1v_upsample (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+merged_1v_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 /* 1:1 vertical sampling case: much easier, never need a spare row. */
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
 
   /* Just do the upsampling. */
   (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr,
@@ -333,43 +303,42 @@ merged_1v_upsample (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-h2v1_merged_upsample (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                      JSAMPARRAY output_buf)
+h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    default:
-      h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                    output_buf);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  default:
+    h2v1_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                  output_buf);
+    break;
   }
 }
 
@@ -379,43 +348,42 @@ h2v1_merged_upsample (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-h2v2_merged_upsample (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                      JSAMPARRAY output_buf)
+h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   switch (cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_BGR:
-      extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                           output_buf);
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                            output_buf);
-      break;
-    default:
-      h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
-                                    output_buf);
-      break;
+  case JCS_EXT_RGB:
+    extrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    extrgbx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_BGR:
+    extbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                         output_buf);
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    extbgrx_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    extxbgr_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    extxrgb_h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                          output_buf);
+    break;
+  default:
+    h2v2_merged_upsample_internal(cinfo, input_buf, in_row_group_ctr,
+                                  output_buf);
+    break;
   }
 }
 
@@ -424,24 +392,21 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
  * RGB565 conversion
  */
 
-#define PACK_SHORT_565_LE(r, g, b)   ((((r) << 8) & 0xF800) |  \
-                                      (((g) << 3) & 0x7E0) | ((b) >> 3))
-#define PACK_SHORT_565_BE(r, g, b)   (((r) & 0xF8) | ((g) >> 5) |  \
-                                      (((g) << 11) & 0xE000) |  \
-                                      (((b) << 5) & 0x1F00))
+#define PACK_SHORT_565_LE(r, g, b) \
+  ((((r) << 8) & 0xF800) | (((g) << 3) & 0x7E0) | ((b) >> 3))
+#define PACK_SHORT_565_BE(r, g, b) \
+  (((r) & 0xF8) | ((g) >> 5) | (((g) << 11) & 0xE000) | (((b) << 5) & 0x1F00))
 
-#define PACK_TWO_PIXELS_LE(l, r)     ((r << 16) | l)
-#define PACK_TWO_PIXELS_BE(l, r)     ((l << 16) | r)
+#define PACK_TWO_PIXELS_LE(l, r)    ((r << 16) | l)
+#define PACK_TWO_PIXELS_BE(l, r)    ((l << 16) | r)
 
-#define PACK_NEED_ALIGNMENT(ptr)  (((size_t)(ptr)) & 3)
-
-#define WRITE_TWO_PIXELS_LE(addr, pixels) {  \
-  ((INT16*)(addr))[0] = (INT16)(pixels);  \
-  ((INT16*)(addr))[1] = (INT16)((pixels) >> 16);  \
+#define WRITE_TWO_PIXELS_LE(addr, pixels) { \
+  ((INT16 *)(addr))[0] = (INT16)(pixels); \
+  ((INT16 *)(addr))[1] = (INT16)((pixels) >> 16); \
 }
-#define WRITE_TWO_PIXELS_BE(addr, pixels) {  \
-  ((INT16*)(addr))[1] = (INT16)(pixels);  \
-  ((INT16*)(addr))[0] = (INT16)((pixels) >> 16);  \
+#define WRITE_TWO_PIXELS_BE(addr, pixels) { \
+  ((INT16 *)(addr))[1] = (INT16)(pixels); \
+  ((INT16 *)(addr))[0] = (INT16)((pixels) >> 16); \
 }
 
 #define DITHER_565_R(r, dither)  ((r) + ((dither) & 0xFF))
@@ -452,7 +417,7 @@ h2v2_merged_upsample (j_decompress_ptr cinfo,
 /* Declarations for ordered dithering
  *
  * We use a 4x4 ordered dither array packed into 32 bits.  This array is
- * sufficent for dithering RGB888 to RGB565.
+ * sufficient for dithering RGB888 to RGB565.
  */
 
 #define DITHER_MASK       0x3
@@ -467,13 +432,13 @@ static const JLONG dither_matrix[4] = {
 
 /* Include inline routines for RGB565 conversion */
 
-#define PACK_SHORT_565 PACK_SHORT_565_LE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_LE
-#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_LE
-#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_le
-#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_le
-#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_le
-#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_le
+#define PACK_SHORT_565  PACK_SHORT_565_LE
+#define PACK_TWO_PIXELS  PACK_TWO_PIXELS_LE
+#define WRITE_TWO_PIXELS  WRITE_TWO_PIXELS_LE
+#define h2v1_merged_upsample_565_internal  h2v1_merged_upsample_565_le
+#define h2v1_merged_upsample_565D_internal  h2v1_merged_upsample_565D_le
+#define h2v2_merged_upsample_565_internal  h2v2_merged_upsample_565_le
+#define h2v2_merged_upsample_565D_internal  h2v2_merged_upsample_565D_le
 #include "jdmrg565.c"
 #undef PACK_SHORT_565
 #undef PACK_TWO_PIXELS
@@ -483,13 +448,13 @@ static const JLONG dither_matrix[4] = {
 #undef h2v2_merged_upsample_565_internal
 #undef h2v2_merged_upsample_565D_internal
 
-#define PACK_SHORT_565 PACK_SHORT_565_BE
-#define PACK_TWO_PIXELS PACK_TWO_PIXELS_BE
-#define WRITE_TWO_PIXELS WRITE_TWO_PIXELS_BE
-#define h2v1_merged_upsample_565_internal h2v1_merged_upsample_565_be
-#define h2v1_merged_upsample_565D_internal h2v1_merged_upsample_565D_be
-#define h2v2_merged_upsample_565_internal h2v2_merged_upsample_565_be
-#define h2v2_merged_upsample_565D_internal h2v2_merged_upsample_565D_be
+#define PACK_SHORT_565  PACK_SHORT_565_BE
+#define PACK_TWO_PIXELS  PACK_TWO_PIXELS_BE
+#define WRITE_TWO_PIXELS  WRITE_TWO_PIXELS_BE
+#define h2v1_merged_upsample_565_internal  h2v1_merged_upsample_565_be
+#define h2v1_merged_upsample_565D_internal  h2v1_merged_upsample_565D_be
+#define h2v2_merged_upsample_565_internal  h2v2_merged_upsample_565_be
+#define h2v2_merged_upsample_565D_internal  h2v2_merged_upsample_565D_be
 #include "jdmrg565.c"
 #undef PACK_SHORT_565
 #undef PACK_TWO_PIXELS
@@ -503,16 +468,15 @@ static const JLONG dither_matrix[4] = {
 static INLINE boolean is_big_endian(void)
 {
   int test_value = 1;
-  if(*(char *)&test_value != 1)
+  if (*(char *)&test_value != 1)
     return TRUE;
   return FALSE;
 }
 
 
 METHODDEF(void)
-h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                          JSAMPARRAY output_buf)
+h2v1_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v1_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -520,13 +484,12 @@ h2v1_merged_upsample_565 (j_decompress_ptr cinfo,
   else
     h2v1_merged_upsample_565_le(cinfo, input_buf, in_row_group_ctr,
                                 output_buf);
- }
+}
 
 
 METHODDEF(void)
-h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
-                           JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                           JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v1_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -538,9 +501,8 @@ h2v1_merged_upsample_565D (j_decompress_ptr cinfo,
 
 
 METHODDEF(void)
-h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                          JSAMPARRAY output_buf)
+h2v2_merged_upsample_565(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v2_merged_upsample_565_be(cinfo, input_buf, in_row_group_ctr,
@@ -552,9 +514,8 @@ h2v2_merged_upsample_565 (j_decompress_ptr cinfo,
 
 
 METHODDEF(void)
-h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
-                           JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
-                           JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                          JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
   if (is_big_endian())
     h2v2_merged_upsample_565D_be(cinfo, input_buf, in_row_group_ctr,
@@ -574,14 +535,14 @@ h2v2_merged_upsample_565D (j_decompress_ptr cinfo,
  */
 
 GLOBAL(void)
-jinit_merged_upsampler (j_decompress_ptr cinfo)
+jinit_merged_upsampler(j_decompress_ptr cinfo)
 {
-  my_upsample_ptr upsample;
+  my_merged_upsample_ptr upsample;
 
-  upsample = (my_upsample_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                sizeof(my_upsampler));
-  cinfo->upsample = (struct jpeg_upsampler *) upsample;
+  upsample = (my_merged_upsample_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(my_merged_upsampler));
+  cinfo->upsample = (struct jpeg_upsampler *)upsample;
   upsample->pub.start_pass = start_pass_merged_upsample;
   upsample->pub.need_context_rows = FALSE;
 
@@ -602,8 +563,8 @@ jinit_merged_upsampler (j_decompress_ptr cinfo)
     }
     /* Allocate a spare row buffer */
     upsample->spare_row = (JSAMPROW)
-      (*cinfo->mem->alloc_large) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                (size_t) (upsample->out_row_width * sizeof(JSAMPLE)));
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                (size_t)(upsample->out_row_width * sizeof(JSAMPLE)));
   } else {
     upsample->pub.upsample = merged_1v_upsample;
     if (jsimd_can_h2v1_merged_upsample())
diff --git a/media/libjpeg/jdmerge.h b/media/libjpeg/jdmerge.h
new file mode 100644
index 0000000000..b583396b10
--- /dev/null
+++ b/media/libjpeg/jdmerge.h
@@ -0,0 +1,47 @@
+/*
+ * jdmerge.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1994-1996, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+#define JPEG_INTERNALS
+#include "jpeglib.h"
+
+#ifdef UPSAMPLE_MERGING_SUPPORTED
+
+
+/* Private subobject */
+
+typedef struct {
+  struct jpeg_upsampler pub;    /* public fields */
+
+  /* Pointer to routine to do actual upsampling/conversion of one row group */
+  void (*upmethod) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                    JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+
+  /* Private state for YCC->RGB conversion */
+  int *Cr_r_tab;                /* => table for Cr to R conversion */
+  int *Cb_b_tab;                /* => table for Cb to B conversion */
+  JLONG *Cr_g_tab;              /* => table for Cr to G conversion */
+  JLONG *Cb_g_tab;              /* => table for Cb to G conversion */
+
+  /* For 2:1 vertical sampling, we produce two output rows at a time.
+   * We need a "spare" row buffer to hold the second output row if the
+   * application provides just a one-row buffer; we also use the spare
+   * to discard the dummy last row if the image height is odd.
+   */
+  JSAMPROW spare_row;
+  boolean spare_full;           /* T if spare buffer is occupied */
+
+  JDIMENSION out_row_width;     /* samples per output row */
+  JDIMENSION rows_to_go;        /* counts rows remaining in image */
+} my_merged_upsampler;
+
+typedef my_merged_upsampler *my_merged_upsample_ptr;
+
+#endif /* UPSAMPLE_MERGING_SUPPORTED */
diff --git a/media/libjpeg/jdmrg565.c b/media/libjpeg/jdmrg565.c
index 18287b3735..980a4e216e 100644
--- a/media/libjpeg/jdmrg565.c
+++ b/media/libjpeg/jdmrg565.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2013, Linaro Limited.
- * Copyright (C) 2014-2015, D. R. Commander.
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -15,23 +15,22 @@
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
-                                   JDIMENSION in_row_group_ctr,
-                                   JSAMPARRAY output_buf)
+h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                                  JDIMENSION in_row_group_ctr,
+                                  JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr;
   JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
   JLONG rgb;
   SHIFT_TEMPS
@@ -44,20 +43,20 @@ h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -69,40 +68,40 @@ h2v1_merged_upsample_565_internal (j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = (INT16)rgb;
-   }
- }
+    *(INT16 *)outptr = (INT16)rgb;
+  }
+}
 
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
-                                    JSAMPIMAGE input_buf,
-                                    JDIMENSION in_row_group_ctr,
-                                    JSAMPARRAY output_buf)
+h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION in_row_group_ctr,
+                                   JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr;
   JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
   unsigned int r, g, b;
   JLONG rgb;
@@ -116,21 +115,21 @@ h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -143,40 +142,39 @@ h2v1_merged_upsample_565D_internal (j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr = (INT16)rgb;
+    *(INT16 *)outptr = (INT16)rgb;
   }
 }
 
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
-                                   JSAMPIMAGE input_buf,
-                                   JDIMENSION in_row_group_ctr,
-                                   JSAMPARRAY output_buf)
+h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                                  JDIMENSION in_row_group_ctr,
+                                  JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr0, outptr1;
   JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   unsigned int r, g, b;
   JLONG rgb;
   SHIFT_TEMPS
@@ -191,20 +189,20 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -213,13 +211,13 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -231,56 +229,56 @@ h2v2_merged_upsample_565_internal (j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = (INT16)rgb;
+    *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = (INT16)rgb;
+    *(INT16 *)outptr1 = (INT16)rgb;
   }
 }
 
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
-                                    JSAMPIMAGE input_buf,
-                                    JDIMENSION in_row_group_ctr,
-                                    JSAMPARRAY output_buf)
+h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION in_row_group_ctr,
+                                   JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr0, outptr1;
   JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   JLONG d0 = dither_matrix[cinfo->output_scanline & DITHER_MASK];
-  JLONG d1 = dither_matrix[(cinfo->output_scanline+1) & DITHER_MASK];
+  JLONG d1 = dither_matrix[(cinfo->output_scanline + 1) & DITHER_MASK];
   unsigned int r, g, b;
   JLONG rgb;
   SHIFT_TEMPS
 
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+  inptr00 = input_buf[0][in_row_group_ctr * 2];
+  inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
   inptr1 = input_buf[1][in_row_group_ctr];
   inptr2 = input_buf[2][in_row_group_ctr];
   outptr0 = output_buf[0];
@@ -289,38 +287,38 @@ h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
-    r = range_limit[DITHER_565_R(y + cred, d1)];
-    g = range_limit[DITHER_565_G(y + cgreen, d1)];
-    b = range_limit[DITHER_565_B(y + cblue, d1)];
-    d1 = DITHER_ROTATE(d1);
+    y  = *inptr00++;
+    r = range_limit[DITHER_565_R(y + cred, d0)];
+    g = range_limit[DITHER_565_G(y + cgreen, d0)];
+    b = range_limit[DITHER_565_B(y + cblue, d0)];
+    d0 = DITHER_ROTATE(d0);
     rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
-    r = range_limit[DITHER_565_R(y + cred, d0)];
-    g = range_limit[DITHER_565_G(y + cgreen, d0)];
-    b = range_limit[DITHER_565_B(y + cblue, d0)];
-    d0 = DITHER_ROTATE(d0);
+    y  = *inptr01++;
+    r = range_limit[DITHER_565_R(y + cred, d1)];
+    g = range_limit[DITHER_565_G(y + cgreen, d1)];
+    b = range_limit[DITHER_565_B(y + cblue, d1)];
+    d1 = DITHER_ROTATE(d1);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
@@ -333,24 +331,24 @@ h2v2_merged_upsample_565D_internal (j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr0 = (INT16)rgb;
+    *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     rgb = PACK_SHORT_565(r, g, b);
-    *(INT16*)outptr1 = (INT16)rgb;
+    *(INT16 *)outptr1 = (INT16)rgb;
   }
 }
diff --git a/media/libjpeg/jdmrgext.c b/media/libjpeg/jdmrgext.c
index 9d7d2af2e9..9bf4f1a307 100644
--- a/media/libjpeg/jdmrgext.c
+++ b/media/libjpeg/jdmrgext.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -21,23 +21,22 @@
 
 INLINE
 LOCAL(void)
-h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
-                               JSAMPIMAGE input_buf,
-                               JDIMENSION in_row_group_ctr,
-                               JSAMPARRAY output_buf)
+h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                              JDIMENSION in_row_group_ctr,
+                              JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr;
   JSAMPROW inptr0, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -47,13 +46,13 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -61,7 +60,7 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr[RGB_ALPHA] = 0xFF;
 #endif
     outptr += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -72,12 +71,12 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -94,27 +93,26 @@ h2v1_merged_upsample_internal (j_decompress_ptr cinfo,
 
 INLINE
 LOCAL(void)
-h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
-                               JSAMPIMAGE input_buf,
-                               JDIMENSION in_row_group_ctr,
-                               JSAMPARRAY output_buf)
+h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                              JDIMENSION in_row_group_ctr,
+                              JSAMPARRAY output_buf)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
   register int y, cred, cgreen, cblue;
   int cb, cr;
   register JSAMPROW outptr0, outptr1;
   JSAMPROW inptr00, inptr01, inptr1, inptr2;
   JDIMENSION col;
   /* copy these pointers into registers if possible */
-  register JSAMPLE * range_limit = cinfo->sample_range_limit;
-  int * Crrtab = upsample->Cr_r_tab;
-  int * Cbbtab = upsample->Cb_b_tab;
-  JLONG * Crgtab = upsample->Cr_g_tab;
-  JLONG * Cbgtab = upsample->Cb_g_tab;
+  register JSAMPLE *range_limit = cinfo->sample_range_limit;
+  int *Crrtab = upsample->Cr_r_tab;
+  int *Cbbtab = upsample->Cb_b_tab;
+  JLONG *Crgtab = upsample->Cr_g_tab;
+  JLONG *Cbgtab = upsample->Cb_g_tab;
   SHIFT_TEMPS
 
-  inptr00 = input_buf[0][in_row_group_ctr*2];
-  inptr01 = input_buf[0][in_row_group_ctr*2 + 1];
+  inptr00 = input_buf[0][in_row_group_ctr * 2];
+  inptr01 = input_buf[0][in_row_group_ctr * 2 + 1];
   inptr1 = input_buf[1][in_row_group_ctr];
   inptr2 = input_buf[2][in_row_group_ctr];
   outptr0 = output_buf[0];
@@ -122,13 +120,13 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -136,7 +134,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -144,7 +142,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -152,7 +150,7 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
     outptr1[RGB_ALPHA] = 0xFF;
 #endif
     outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -163,19 +161,19 @@ h2v2_merged_upsample_internal (j_decompress_ptr cinfo,
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
-    cgreen = (int) RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
+    cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
diff --git a/media/libjpeg/jdphuff.c b/media/libjpeg/jdphuff.c
index c927ffa071..9680ebcbd0 100644
--- a/media/libjpeg/jdphuff.c
+++ b/media/libjpeg/jdphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -15,12 +15,16 @@
  * up to the start of the current MCU.  To do this, we copy state variables
  * into local working storage, and update them back to the permanent
  * storage only upon successful completion of an MCU.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
  */
 
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jdhuff.h"             /* Declarations shared with jdhuff.c */
+#include <limits.h>
 
 
 #ifdef D_PROGRESSIVE_SUPPORTED
@@ -37,25 +41,6 @@ typedef struct {
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest,src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest,src)  \
-        ((dest).EOBRUN = (src).EOBRUN, \
-         (dest).last_dc_val[0] = (src).last_dc_val[0], \
-         (dest).last_dc_val[1] = (src).last_dc_val[1], \
-         (dest).last_dc_val[2] = (src).last_dc_val[2], \
-         (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -77,14 +62,14 @@ typedef struct {
 typedef phuff_entropy_decoder *phuff_entropy_ptr;
 
 /* Forward declarations */
-METHODDEF(boolean) decode_mcu_DC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_DC_first(j_decompress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_AC_first(j_decompress_ptr cinfo,
+                                       JBLOCKROW *MCU_data);
+METHODDEF(boolean) decode_mcu_DC_refine(j_decompress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_first (j_decompress_ptr cinfo,
+METHODDEF(boolean) decode_mcu_AC_refine(j_decompress_ptr cinfo,
                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_DC_refine (j_decompress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
-METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
-                                         JBLOCKROW *MCU_data);
 
 
 /*
@@ -92,13 +77,13 @@ METHODDEF(boolean) decode_mcu_AC_refine (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-start_pass_phuff_decoder (j_decompress_ptr cinfo)
+start_pass_phuff_decoder(j_decompress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   boolean is_DC_band, bad;
   int ci, coefi, tbl;
   d_derived_tbl **pdtbl;
-  int *coef_bit_ptr;
+  int *coef_bit_ptr, *prev_coef_bit_ptr;
   jpeg_component_info *compptr;
 
   is_DC_band = (cinfo->Ss == 0);
@@ -118,7 +103,7 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
   }
   if (cinfo->Ah != 0) {
     /* Successive approximation refinement scan: must have Al = Ah-1. */
-    if (cinfo->Al != cinfo->Ah-1)
+    if (cinfo->Al != cinfo->Ah - 1)
       bad = TRUE;
   }
   if (cinfo->Al > 13)           /* need not check for < 0 */
@@ -138,9 +123,16 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
    */
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     int cindex = cinfo->cur_comp_info[ci]->component_index;
-    coef_bit_ptr = & cinfo->coef_bits[cindex][0];
+    coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+    prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0];
     if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
       WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+    for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+      else
+        prev_coef_bit_ptr[coefi] = 0;
+    }
     for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
       int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
       if (cinfo->Ah != expected)
@@ -205,22 +197,26 @@ start_pass_phuff_decoder (j_decompress_ptr cinfo)
 #define AVOID_TABLES
 #ifdef AVOID_TABLES
 
-#define NEG_1 ((unsigned)-1)
-#define HUFF_EXTEND(x,s)  ((x) < (1<<((s)-1)) ? (x) + (((NEG_1)<<(s)) + 1) : (x))
+#define NEG_1  ((unsigned)-1)
+#define HUFF_EXTEND(x, s) \
+  ((x) < (1 << ((s) - 1)) ? (x) + (((NEG_1) << (s)) + 1) : (x))
 
 #else
 
-#define HUFF_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
+#define HUFF_EXTEND(x, s) \
+  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))
 
-static const int extend_test[16] =   /* entry n is 2**(n-1) */
-  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
-    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int extend_test[16] = {   /* entry n is 2**(n-1) */
+  0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
+  0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000
+};
 
-static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
-  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
-    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
-    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
-    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int extend_offset[16] = { /* entry n is (-1 << n) + 1 */
+  0, ((-1) << 1) + 1, ((-1) << 2) + 1, ((-1) << 3) + 1, ((-1) << 4) + 1,
+  ((-1) << 5) + 1, ((-1) << 6) + 1, ((-1) << 7) + 1, ((-1) << 8) + 1,
+  ((-1) << 9) + 1, ((-1) << 10) + 1, ((-1) << 11) + 1, ((-1) << 12) + 1,
+  ((-1) << 13) + 1, ((-1) << 14) + 1, ((-1) << 15) + 1
+};
 
 #endif /* AVOID_TABLES */
 
@@ -231,9 +227,9 @@ static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
  */
 
 LOCAL(boolean)
-process_restart (j_decompress_ptr cinfo)
+process_restart(j_decompress_ptr cinfo)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int ci;
 
   /* Throw away any unused bits remaining in bit buffer; */
@@ -242,7 +238,7 @@ process_restart (j_decompress_ptr cinfo)
   entropy->bitstate.bits_left = 0;
 
   /* Advance past the RSTn marker */
-  if (! (*cinfo->marker->read_restart_marker) (cinfo))
+  if (!(*cinfo->marker->read_restart_marker) (cinfo))
     return FALSE;
 
   /* Re-initialize DC predictions to 0 */
@@ -289,9 +285,9 @@ process_restart (j_decompress_ptr cinfo)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int Al = cinfo->Al;
   register int s, r;
   int blkn, ci;
@@ -304,18 +300,18 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
+    BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
+    state = entropy->saved;
 
     /* Outer loop handles each block in the MCU */
 
@@ -336,19 +332,24 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
       }
 
       /* Convert DC difference to actual value, update last_dc_val */
+      if ((state.last_dc_val[ci] >= 0 &&
+           s > INT_MAX - state.last_dc_val[ci]) ||
+          (state.last_dc_val[ci] < 0 && s < INT_MIN - state.last_dc_val[ci]))
+        ERREXIT(cinfo, JERR_BAD_DCT_COEF);
       s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       /* Scale and output the coefficient (assumes jpeg_natural_order[0]=0) */
-      (*block)[0] = (JCOEF) LEFT_SHIFT(s, Al);
+      (*block)[0] = (JCOEF)LEFT_SHIFT(s, Al);
     }
 
     /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
+    BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
+    entropy->saved = state;
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -360,9 +361,9 @@ decode_mcu_DC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int Se = cinfo->Se;
   int Al = cinfo->Al;
   register int s, k, r;
@@ -374,14 +375,14 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
   /* If we've run out of data, just leave the MCU set to zeroes.
    * This way, we return uniform gray for the remainder of the segment.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     /* Load up working state.
      * We can avoid loading/saving bitread state if in an EOB run.
@@ -393,7 +394,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     if (EOBRUN > 0)             /* if it's a band of zeroes... */
       EOBRUN--;                 /* ...process it now (we do nothing) */
     else {
-      BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+      BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
       block = MCU_data[0];
       tbl = entropy->ac_derived_tbl;
 
@@ -407,7 +408,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
           r = GET_BITS(s);
           s = HUFF_EXTEND(r, s);
           /* Scale and output coefficient in natural (dezigzagged) order */
-          (*block)[jpeg_natural_order[k]] = (JCOEF) LEFT_SHIFT(s, Al);
+          (*block)[jpeg_natural_order[k]] = (JCOEF)LEFT_SHIFT(s, Al);
         } else {
           if (r == 15) {        /* ZRL */
             k += 15;            /* skip 15 zeroes in band */
@@ -424,7 +425,7 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
         }
       }
 
-      BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+      BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
     }
 
     /* Completed MCU, so update state */
@@ -432,7 +433,8 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -445,9 +447,9 @@ decode_mcu_AC_first (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int p1 = 1 << cinfo->Al;      /* 1 in the bit position being coded */
   int blkn;
   JBLOCKROW block;
@@ -456,7 +458,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
@@ -465,7 +467,7 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
    */
 
   /* Load up working state */
-  BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+  BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
 
   /* Outer loop handles each block in the MCU */
 
@@ -480,10 +482,11 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Completed MCU, so update state */
-  BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+  BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -494,9 +497,9 @@ decode_mcu_DC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
  */
 
 METHODDEF(boolean)
-decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
-  phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+  phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
   int Se = cinfo->Se;
   int p1 = 1 << cinfo->Al;        /* 1 in the bit position being coded */
   int m1 = (NEG_1) << cinfo->Al;  /* -1 in the bit position being coded */
@@ -512,16 +515,16 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Process restart marker if needed; may have to suspend */
   if (cinfo->restart_interval) {
     if (entropy->restarts_to_go == 0)
-      if (! process_restart(cinfo))
+      if (!process_restart(cinfo))
         return FALSE;
   }
 
   /* If we've run out of data, don't modify the MCU.
    */
-  if (! entropy->pub.insufficient_data) {
+  if (!entropy->pub.insufficient_data) {
 
     /* Load up working state */
-    BITREAD_LOAD_STATE(cinfo,entropy->bitstate);
+    BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
     EOBRUN = entropy->saved.EOBRUN; /* only part of saved state we need */
 
     /* There is always only one block per MCU */
@@ -575,9 +578,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
             if (GET_BITS(1)) {
               if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
                 if (*thiscoef >= 0)
-                  *thiscoef += p1;
+                  *thiscoef += (JCOEF)p1;
                 else
-                  *thiscoef += m1;
+                  *thiscoef += (JCOEF)m1;
               }
             }
           } else {
@@ -589,7 +592,7 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
         if (s) {
           int pos = jpeg_natural_order[k];
           /* Output newly nonzero coefficient */
-          (*block)[pos] = (JCOEF) s;
+          (*block)[pos] = (JCOEF)s;
           /* Remember its position in case we have to suspend */
           newnz_pos[num_newnz++] = pos;
         }
@@ -609,9 +612,9 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
           if (GET_BITS(1)) {
             if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
               if (*thiscoef >= 0)
-                *thiscoef += p1;
+                *thiscoef += (JCOEF)p1;
               else
-                *thiscoef += m1;
+                *thiscoef += (JCOEF)m1;
             }
           }
         }
@@ -621,12 +624,13 @@ decode_mcu_AC_refine (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     }
 
     /* Completed MCU, so update state */
-    BITREAD_SAVE_STATE(cinfo,entropy->bitstate);
+    BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
     entropy->saved.EOBRUN = EOBRUN; /* only part of saved state we need */
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 
@@ -644,16 +648,16 @@ undoit:
  */
 
 GLOBAL(void)
-jinit_phuff_decoder (j_decompress_ptr cinfo)
+jinit_phuff_decoder(j_decompress_ptr cinfo)
 {
   phuff_entropy_ptr entropy;
   int *coef_bit_ptr;
   int ci, i;
 
   entropy = (phuff_entropy_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(phuff_entropy_decoder));
-  cinfo->entropy = (struct jpeg_entropy_decoder *) entropy;
+  cinfo->entropy = (struct jpeg_entropy_decoder *)entropy;
   entropy->pub.start_pass = start_pass_phuff_decoder;
 
   /* Mark derived tables unallocated */
@@ -663,9 +667,10 @@ jinit_phuff_decoder (j_decompress_ptr cinfo)
 
   /* Create progression status table */
   cinfo->coef_bits = (int (*)[DCTSIZE2])
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                cinfo->num_components*DCTSIZE2*sizeof(int));
-  coef_bit_ptr = & cinfo->coef_bits[0][0];
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                cinfo->num_components * 2 * DCTSIZE2 *
+                                sizeof(int));
+  coef_bit_ptr = &cinfo->coef_bits[0][0];
   for (ci = 0; ci < cinfo->num_components; ci++)
     for (i = 0; i < DCTSIZE2; i++)
       *coef_bit_ptr++ = -1;
diff --git a/media/libjpeg/jdpostct.c b/media/libjpeg/jdpostct.c
index 601fc2a792..6a2cf5c1b3 100644
--- a/media/libjpeg/jdpostct.c
+++ b/media/libjpeg/jdpostct.c
@@ -46,22 +46,28 @@ typedef my_post_controller *my_post_ptr;
 
 
 /* Forward declarations */
-METHODDEF(void) post_process_1pass
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
-         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-         JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_1pass(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION *in_row_group_ctr,
+                                   JDIMENSION in_row_groups_avail,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION *out_row_ctr,
+                                   JDIMENSION out_rows_avail);
 #ifdef QUANT_2PASS_SUPPORTED
-METHODDEF(void) post_process_prepass
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
-         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-         JDIMENSION out_rows_avail);
-METHODDEF(void) post_process_2pass
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
-         JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-         JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_prepass(j_decompress_ptr cinfo,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION *in_row_group_ctr,
+                                     JDIMENSION in_row_groups_avail,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION *out_row_ctr,
+                                     JDIMENSION out_rows_avail);
+METHODDEF(void) post_process_2pass(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf,
+                                   JDIMENSION *in_row_group_ctr,
+                                   JDIMENSION in_row_groups_avail,
+                                   JSAMPARRAY output_buf,
+                                   JDIMENSION *out_row_ctr,
+                                   JDIMENSION out_rows_avail);
 #endif
 
 
@@ -70,9 +76,9 @@ METHODDEF(void) post_process_2pass
  */
 
 METHODDEF(void)
-start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
+start_pass_dpost(j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
 
   switch (pass_mode) {
   case JBUF_PASS_THRU:
@@ -85,8 +91,8 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
        */
       if (post->buffer == NULL) {
         post->buffer = (*cinfo->mem->access_virt_sarray)
-          ((j_common_ptr) cinfo, post->whole_image,
-           (JDIMENSION) 0, post->strip_height, TRUE);
+          ((j_common_ptr)cinfo, post->whole_image,
+           (JDIMENSION)0, post->strip_height, TRUE);
       }
     } else {
       /* For single-pass processing without color quantization,
@@ -123,13 +129,12 @@ start_pass_dpost (j_decompress_ptr cinfo, J_BUF_MODE pass_mode)
  */
 
 METHODDEF(void)
-post_process_1pass (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+post_process_1pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
   JDIMENSION num_rows, max_rows;
 
   /* Fill the buffer, but not more than what we can dump out in one go. */
@@ -138,12 +143,13 @@ post_process_1pass (j_decompress_ptr cinfo,
   if (max_rows > post->strip_height)
     max_rows = post->strip_height;
   num_rows = 0;
-  (*cinfo->upsample->upsample) (cinfo,
-                input_buf, in_row_group_ctr, in_row_groups_avail,
-                post->buffer, &num_rows, max_rows);
+  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+                                in_row_groups_avail, post->buffer, &num_rows,
+                                max_rows);
   /* Quantize and emit data. */
-  (*cinfo->cquantize->color_quantize) (cinfo,
-                post->buffer, output_buf + *out_row_ctr, (int) num_rows);
+  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer,
+                                       output_buf + *out_row_ctr,
+                                       (int)num_rows);
   *out_row_ctr += num_rows;
 }
 
@@ -155,34 +161,33 @@ post_process_1pass (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-post_process_prepass (j_decompress_ptr cinfo,
-                      JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                      JDIMENSION in_row_groups_avail,
-                      JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                      JDIMENSION out_rows_avail)
+post_process_prepass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                     JDIMENSION *in_row_group_ctr,
+                     JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                     JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
   JDIMENSION old_next_row, num_rows;
 
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
     post->buffer = (*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, post->whole_image,
+        ((j_common_ptr)cinfo, post->whole_image,
          post->starting_row, post->strip_height, TRUE);
   }
 
   /* Upsample some data (up to a strip height's worth). */
   old_next_row = post->next_row;
-  (*cinfo->upsample->upsample) (cinfo,
-                input_buf, in_row_group_ctr, in_row_groups_avail,
-                post->buffer, &post->next_row, post->strip_height);
+  (*cinfo->upsample->upsample) (cinfo, input_buf, in_row_group_ctr,
+                                in_row_groups_avail, post->buffer,
+                                &post->next_row, post->strip_height);
 
   /* Allow quantizer to scan new data.  No data is emitted, */
   /* but we advance out_row_ctr so outer loop can tell when we're done. */
   if (post->next_row > old_next_row) {
     num_rows = post->next_row - old_next_row;
     (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + old_next_row,
-                                         (JSAMPARRAY) NULL, (int) num_rows);
+                                         (JSAMPARRAY)NULL, (int)num_rows);
     *out_row_ctr += num_rows;
   }
 
@@ -199,19 +204,18 @@ post_process_prepass (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-post_process_2pass (j_decompress_ptr cinfo,
-                    JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-                    JDIMENSION in_row_groups_avail,
-                    JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-                    JDIMENSION out_rows_avail)
+post_process_2pass(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                   JDIMENSION *in_row_group_ctr,
+                   JDIMENSION in_row_groups_avail, JSAMPARRAY output_buf,
+                   JDIMENSION *out_row_ctr, JDIMENSION out_rows_avail)
 {
-  my_post_ptr post = (my_post_ptr) cinfo->post;
+  my_post_ptr post = (my_post_ptr)cinfo->post;
   JDIMENSION num_rows, max_rows;
 
   /* Reposition virtual buffer if at start of strip. */
   if (post->next_row == 0) {
     post->buffer = (*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr) cinfo, post->whole_image,
+        ((j_common_ptr)cinfo, post->whole_image,
          post->starting_row, post->strip_height, FALSE);
   }
 
@@ -226,9 +230,9 @@ post_process_2pass (j_decompress_ptr cinfo,
     num_rows = max_rows;
 
   /* Quantize and emit data. */
-  (*cinfo->cquantize->color_quantize) (cinfo,
-                post->buffer + post->next_row, output_buf + *out_row_ctr,
-                (int) num_rows);
+  (*cinfo->cquantize->color_quantize) (cinfo, post->buffer + post->next_row,
+                                       output_buf + *out_row_ctr,
+                                       (int)num_rows);
   *out_row_ctr += num_rows;
 
   /* Advance if we filled the strip. */
@@ -247,14 +251,14 @@ post_process_2pass (j_decompress_ptr cinfo,
  */
 
 GLOBAL(void)
-jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
+jinit_d_post_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 {
   my_post_ptr post;
 
   post = (my_post_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_post_controller));
-  cinfo->post = (struct jpeg_d_post_controller *) post;
+  cinfo->post = (struct jpeg_d_post_controller *)post;
   post->pub.start_pass = start_pass_dpost;
   post->whole_image = NULL;     /* flag for no virtual arrays */
   post->buffer = NULL;          /* flag for no strip buffer */
@@ -265,16 +269,16 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
      * an efficient number of rows for upsampling to return.
      * (In the presence of output rescaling, we might want to be smarter?)
      */
-    post->strip_height = (JDIMENSION) cinfo->max_v_samp_factor;
+    post->strip_height = (JDIMENSION)cinfo->max_v_samp_factor;
     if (need_full_buffer) {
       /* Two-pass color quantization: need full-image storage. */
       /* We round up the number of rows to a multiple of the strip height. */
 #ifdef QUANT_2PASS_SUPPORTED
       post->whole_image = (*cinfo->mem->request_virt_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE, FALSE,
+        ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
          cinfo->output_width * cinfo->out_color_components,
-         (JDIMENSION) jround_up((long) cinfo->output_height,
-                                (long) post->strip_height),
+         (JDIMENSION)jround_up((long)cinfo->output_height,
+                               (long)post->strip_height),
          post->strip_height);
 #else
       ERREXIT(cinfo, JERR_BAD_BUFFER_MODE);
@@ -282,7 +286,7 @@ jinit_d_post_controller (j_decompress_ptr cinfo, boolean need_full_buffer)
     } else {
       /* One-pass color quantization: just make a strip buffer. */
       post->buffer = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
          cinfo->output_width * cinfo->out_color_components,
          post->strip_height);
     }
diff --git a/media/libjpeg/jdsample.c b/media/libjpeg/jdsample.c
index b1378e1512..eaad72a030 100644
--- a/media/libjpeg/jdsample.c
+++ b/media/libjpeg/jdsample.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2019-2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -36,9 +37,9 @@
  */
 
 METHODDEF(void)
-start_pass_upsample (j_decompress_ptr cinfo)
+start_pass_upsample(j_decompress_ptr cinfo)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
 
   /* Mark the conversion buffer empty */
   upsample->next_row_out = cinfo->max_v_samp_factor;
@@ -56,13 +57,12 @@ start_pass_upsample (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-sep_upsample (j_decompress_ptr cinfo,
-              JSAMPIMAGE input_buf, JDIMENSION *in_row_group_ctr,
-              JDIMENSION in_row_groups_avail,
-              JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
-              JDIMENSION out_rows_avail)
+sep_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+             JDIMENSION *in_row_group_ctr, JDIMENSION in_row_groups_avail,
+             JSAMPARRAY output_buf, JDIMENSION *out_row_ctr,
+             JDIMENSION out_rows_avail)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   int ci;
   jpeg_component_info *compptr;
   JDIMENSION num_rows;
@@ -84,7 +84,7 @@ sep_upsample (j_decompress_ptr cinfo,
   /* Color-convert and emit rows */
 
   /* How many we have in the buffer: */
-  num_rows = (JDIMENSION) (cinfo->max_v_samp_factor - upsample->next_row_out);
+  num_rows = (JDIMENSION)(cinfo->max_v_samp_factor - upsample->next_row_out);
   /* Not more than the distance to the end of the image.  Need this test
    * in case the image height is not a multiple of max_v_samp_factor:
    */
@@ -96,9 +96,8 @@ sep_upsample (j_decompress_ptr cinfo,
     num_rows = out_rows_avail;
 
   (*cinfo->cconvert->color_convert) (cinfo, upsample->color_buf,
-                                     (JDIMENSION) upsample->next_row_out,
-                                     output_buf + *out_row_ctr,
-                                     (int) num_rows);
+                                     (JDIMENSION)upsample->next_row_out,
+                                     output_buf + *out_row_ctr, (int)num_rows);
 
   /* Adjust counts */
   *out_row_ctr += num_rows;
@@ -124,8 +123,8 @@ sep_upsample (j_decompress_ptr cinfo,
  */
 
 METHODDEF(void)
-fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+fullsize_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                  JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = input_data;
 }
@@ -137,8 +136,8 @@ fullsize_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+noop_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   *output_data_ptr = NULL;      /* safety check */
 }
@@ -156,10 +155,10 @@ noop_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
   register JSAMPLE invalue;
@@ -178,15 +177,15 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       for (h = h_expand; h > 0; h--) {
         *outptr++ = invalue;
       }
     }
     /* Generate any additional output rows by duplicating the first one */
     if (v_expand > 1) {
-      jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
-                        v_expand-1, cinfo->output_width);
+      jcopy_sample_rows(output_data, outrow, output_data, outrow + 1,
+                        v_expand - 1, cinfo->output_width);
     }
     inrow++;
     outrow += v_expand;
@@ -200,8 +199,8 @@ int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -214,7 +213,7 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[inrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -228,8 +227,8 @@ h2v1_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -243,12 +242,12 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
-    jcopy_sample_rows(output_data, outrow, output_data, outrow+1,
-                      1, cinfo->output_width);
+    jcopy_sample_rows(output_data, outrow, output_data, outrow + 1, 1,
+                      cinfo->output_width);
     inrow++;
     outrow += 2;
   }
@@ -271,8 +270,8 @@ h2v2_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr, outptr;
@@ -284,21 +283,21 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     inptr = input_data[inrow];
     outptr = output_data[inrow];
     /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
-    *outptr++ = (JSAMPLE) invalue;
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+    invalue = *inptr++;
+    *outptr++ = (JSAMPLE)invalue;
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
 
     for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
       /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE) ((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+      invalue = (*inptr++) * 3;
+      *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+      *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
     }
 
     /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE) ((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
-    *outptr++ = (JSAMPLE) invalue;
+    invalue = *inptr;
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
+    *outptr++ = (JSAMPLE)invalue;
   }
 }
 
@@ -311,15 +310,15 @@ h2v1_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr0, inptr1, outptr;
 #if BITS_IN_JSAMPLE == 8
-  int thiscolsum;
+  int thiscolsum, bias;
 #else
-  JLONG thiscolsum;
+  JLONG thiscolsum, bias;
 #endif
   JDIMENSION colctr;
   int inrow, outrow, v;
@@ -329,15 +328,18 @@ h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     for (v = 0; v < 2; v++) {
       /* inptr0 points to nearest input row, inptr1 points to next nearest */
       inptr0 = input_data[inrow];
-      if (v == 0)               /* next nearest is row above */
-        inptr1 = input_data[inrow-1];
-      else                      /* next nearest is row below */
-        inptr1 = input_data[inrow+1];
+      if (v == 0) {             /* next nearest is row above */
+        inptr1 = input_data[inrow - 1];
+        bias = 1;
+      } else {                  /* next nearest is row below */
+        inptr1 = input_data[inrow + 1];
+        bias = 2;
+      }
       outptr = output_data[outrow++];
 
-      for(colctr = 0; colctr < compptr->downsampled_width; colctr++) {
-        thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-        *outptr++ = (JSAMPLE) ((thiscolsum + 1) >> 2);
+      for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
+        thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+        *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
       }
     }
     inrow++;
@@ -354,8 +356,8 @@ h1v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 METHODDEF(void)
-h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   register JSAMPROW inptr0, inptr1, outptr;
@@ -373,30 +375,30 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
       /* inptr0 points to nearest input row, inptr1 points to next nearest */
       inptr0 = input_data[inrow];
       if (v == 0)               /* next nearest is row above */
-        inptr1 = input_data[inrow-1];
+        inptr1 = input_data[inrow - 1];
       else                      /* next nearest is row below */
-        inptr1 = input_data[inrow+1];
+        inptr1 = input_data[inrow + 1];
       outptr = output_data[outrow++];
 
       /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-      lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+      thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+      nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+      *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
+      *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+      lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
 
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
         /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
         /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-        nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-        *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-        *outptr++ = (JSAMPLE) ((thiscolsum * 3 + nextcolsum + 7) >> 4);
-        lastcolsum = thiscolsum; thiscolsum = nextcolsum;
+        nextcolsum = (*inptr0++) * 3 + (*inptr1++);
+        *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+        *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
+        lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
       }
 
       /* Special case for last column */
-      *outptr++ = (JSAMPLE) ((thiscolsum * 3 + lastcolsum + 8) >> 4);
-      *outptr++ = (JSAMPLE) ((thiscolsum * 4 + 7) >> 4);
+      *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
+      *outptr++ = (JSAMPLE)((thiscolsum * 4 + 7) >> 4);
     }
     inrow++;
   }
@@ -408,7 +410,7 @@ h2v2_fancy_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jinit_upsampler (j_decompress_ptr cinfo)
+jinit_upsampler(j_decompress_ptr cinfo)
 {
   my_upsample_ptr upsample;
   int ci;
@@ -418,14 +420,14 @@ jinit_upsampler (j_decompress_ptr cinfo)
 
   if (!cinfo->master->jinit_upsampler_no_alloc) {
     upsample = (my_upsample_ptr)
-      (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   sizeof(my_upsampler));
-    cinfo->upsample = (struct jpeg_upsampler *) upsample;
+    cinfo->upsample = (struct jpeg_upsampler *)upsample;
     upsample->pub.start_pass = start_pass_upsample;
     upsample->pub.upsample = sep_upsample;
     upsample->pub.need_context_rows = FALSE; /* until we find out differently */
   } else
-    upsample = (my_upsample_ptr) cinfo->upsample;
+    upsample = (my_upsample_ptr)cinfo->upsample;
 
   if (cinfo->CCIR601_sampling)  /* this isn't supported */
     ERREXIT(cinfo, JERR_CCIR601_NOTIMPL);
@@ -451,7 +453,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
     v_out_group = cinfo->max_v_samp_factor;
     upsample->rowgroup_height[ci] = v_in_group; /* save for use later */
     need_buffer = TRUE;
-    if (! compptr->component_needed) {
+    if (!compptr->component_needed) {
       /* Don't bother to upsample an uninteresting component. */
       upsample->methods[ci] = noop_upsample;
       need_buffer = FALSE;
@@ -459,8 +461,7 @@ jinit_upsampler (j_decompress_ptr cinfo)
       /* Fullsize components can be processed without any work. */
       upsample->methods[ci] = fullsize_upsample;
       need_buffer = FALSE;
-    } else if (h_in_group * 2 == h_out_group &&
-               v_in_group == v_out_group) {
+    } else if (h_in_group * 2 == h_out_group && v_in_group == v_out_group) {
       /* Special cases for 2h1v upsampling */
       if (do_fancy && compptr->downsampled_width > 2) {
         if (jsimd_can_h2v1_fancy_upsample())
@@ -476,7 +477,13 @@ jinit_upsampler (j_decompress_ptr cinfo)
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-      upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__) || \
+    defined(_M_ARM) || defined(_M_ARM64)
+      if (jsimd_can_h1v2_fancy_upsample())
+        upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+      else
+#endif
+        upsample->methods[ci] = h1v2_fancy_upsample;
       upsample->pub.need_context_rows = TRUE;
     } else if (h_in_group * 2 == h_out_group &&
                v_in_group * 2 == v_out_group) {
@@ -502,16 +509,16 @@ jinit_upsampler (j_decompress_ptr cinfo)
       else
 #endif
         upsample->methods[ci] = int_upsample;
-      upsample->h_expand[ci] = (UINT8) (h_out_group / h_in_group);
-      upsample->v_expand[ci] = (UINT8) (v_out_group / v_in_group);
+      upsample->h_expand[ci] = (UINT8)(h_out_group / h_in_group);
+      upsample->v_expand[ci] = (UINT8)(v_out_group / v_in_group);
     } else
       ERREXIT(cinfo, JERR_FRACT_SAMPLE_NOTIMPL);
     if (need_buffer && !cinfo->master->jinit_upsampler_no_alloc) {
       upsample->color_buf[ci] = (*cinfo->mem->alloc_sarray)
-        ((j_common_ptr) cinfo, JPOOL_IMAGE,
-         (JDIMENSION) jround_up((long) cinfo->output_width,
-                                (long) cinfo->max_h_samp_factor),
-         (JDIMENSION) cinfo->max_v_samp_factor);
+        ((j_common_ptr)cinfo, JPOOL_IMAGE,
+         (JDIMENSION)jround_up((long)cinfo->output_width,
+                               (long)cinfo->max_h_samp_factor),
+         (JDIMENSION)cinfo->max_v_samp_factor);
     }
   }
 }
diff --git a/media/libjpeg/jdtrans.c b/media/libjpeg/jdtrans.c
index cfc85dd24c..d7ec4b83b3 100644
--- a/media/libjpeg/jdtrans.c
+++ b/media/libjpeg/jdtrans.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -16,10 +16,11 @@
 #define JPEG_INTERNALS
 #include "jinclude.h"
 #include "jpeglib.h"
+#include "jpegcomp.h"
 
 
 /* Forward declarations */
-LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
+LOCAL(void) transdecode_master_selection(j_decompress_ptr cinfo);
 
 
 /*
@@ -45,7 +46,7 @@ LOCAL(void) transdecode_master_selection (j_decompress_ptr cinfo);
  */
 
 GLOBAL(jvirt_barray_ptr *)
-jpeg_read_coefficients (j_decompress_ptr cinfo)
+jpeg_read_coefficients(j_decompress_ptr cinfo)
 {
   if (cinfo->global_state == DSTATE_READY) {
     /* First call: initialize active modules */
@@ -58,7 +59,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
       int retcode;
       /* Call progress monitor hook if present */
       if (cinfo->progress != NULL)
-        (*cinfo->progress->progress_monitor) ((j_common_ptr) cinfo);
+        (*cinfo->progress->progress_monitor) ((j_common_ptr)cinfo);
       /* Absorb some more input */
       retcode = (*cinfo->inputctl->consume_input) (cinfo);
       if (retcode == JPEG_SUSPENDED)
@@ -70,7 +71,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
           (retcode == JPEG_ROW_COMPLETED || retcode == JPEG_REACHED_SOS)) {
         if (++cinfo->progress->pass_counter >= cinfo->progress->pass_limit) {
           /* startup underestimated number of scans; ratchet up one scan */
-          cinfo->progress->pass_limit += (long) cinfo->total_iMCU_rows;
+          cinfo->progress->pass_limit += (long)cinfo->total_iMCU_rows;
         }
       }
     }
@@ -97,7 +98,7 @@ jpeg_read_coefficients (j_decompress_ptr cinfo)
  */
 
 LOCAL(void)
-transdecode_master_selection (j_decompress_ptr cinfo)
+transdecode_master_selection(j_decompress_ptr cinfo)
 {
   /* This is effectively a buffered-image operation. */
   cinfo->buffered_image = TRUE;
@@ -129,7 +130,7 @@ transdecode_master_selection (j_decompress_ptr cinfo)
   jinit_d_coef_controller(cinfo, TRUE);
 
   /* We can now tell the memory manager to allocate virtual arrays. */
-  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr) cinfo);
+  (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
   /* Initialize input side of decompressor to consume first scan. */
   (*cinfo->inputctl->start_input_pass) (cinfo);
@@ -148,7 +149,7 @@ transdecode_master_selection (j_decompress_ptr cinfo)
       nscans = 1;
     }
     cinfo->progress->pass_counter = 0L;
-    cinfo->progress->pass_limit = (long) cinfo->total_iMCU_rows * nscans;
+    cinfo->progress->pass_limit = (long)cinfo->total_iMCU_rows * nscans;
     cinfo->progress->completed_passes = 0;
     cinfo->progress->total_passes = 1;
   }
diff --git a/media/libjpeg/jerror.c b/media/libjpeg/jerror.c
index c31acd9ef0..d544702937 100644
--- a/media/libjpeg/jerror.c
+++ b/media/libjpeg/jerror.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -44,7 +44,7 @@
  * want to refer to it directly.
  */
 
-#define JMESSAGE(code,string)   string ,
+#define JMESSAGE(code, string)  string,
 
 const char * const jpeg_std_message_table[] = {
 #include "jerror.h"
@@ -66,7 +66,7 @@ const char * const jpeg_std_message_table[] = {
  */
 
 METHODDEF(void)
-error_exit (j_common_ptr cinfo)
+error_exit(j_common_ptr cinfo)
 {
   /* Always display the message */
   (*cinfo->err->output_message) (cinfo);
@@ -94,7 +94,7 @@ error_exit (j_common_ptr cinfo)
  */
 
 METHODDEF(void)
-output_message (j_common_ptr cinfo)
+output_message(j_common_ptr cinfo)
 {
   char buffer[JMSG_LENGTH_MAX];
 
@@ -124,7 +124,7 @@ output_message (j_common_ptr cinfo)
  */
 
 METHODDEF(void)
-emit_message (j_common_ptr cinfo, int msg_level)
+emit_message(j_common_ptr cinfo, int msg_level)
 {
   struct jpeg_error_mgr *err = cinfo->err;
 
@@ -153,7 +153,7 @@ emit_message (j_common_ptr cinfo, int msg_level)
  */
 
 METHODDEF(void)
-format_message (j_common_ptr cinfo, char *buffer)
+format_message(j_common_ptr cinfo, char *buffer)
 {
   struct jpeg_error_mgr *err = cinfo->err;
   int msg_code = err->msg_code;
@@ -189,13 +189,13 @@ format_message (j_common_ptr cinfo, char *buffer)
 
   /* Format the message into the passed buffer */
   if (isstring)
-    sprintf(buffer, msgtext, err->msg_parm.s);
+    snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
   else
-    sprintf(buffer, msgtext,
-            err->msg_parm.i[0], err->msg_parm.i[1],
-            err->msg_parm.i[2], err->msg_parm.i[3],
-            err->msg_parm.i[4], err->msg_parm.i[5],
-            err->msg_parm.i[6], err->msg_parm.i[7]);
+    snprintf(buffer, JMSG_LENGTH_MAX, msgtext,
+             err->msg_parm.i[0], err->msg_parm.i[1],
+             err->msg_parm.i[2], err->msg_parm.i[3],
+             err->msg_parm.i[4], err->msg_parm.i[5],
+             err->msg_parm.i[6], err->msg_parm.i[7]);
 }
 
 
@@ -208,7 +208,7 @@ format_message (j_common_ptr cinfo, char *buffer)
  */
 
 METHODDEF(void)
-reset_error_mgr (j_common_ptr cinfo)
+reset_error_mgr(j_common_ptr cinfo)
 {
   cinfo->err->num_warnings = 0;
   /* trace_level is not reset since it is an application-supplied parameter */
@@ -227,7 +227,7 @@ reset_error_mgr (j_common_ptr cinfo)
  */
 
 GLOBAL(struct jpeg_error_mgr *)
-jpeg_std_error (struct jpeg_error_mgr *err)
+jpeg_std_error(struct jpeg_error_mgr *err)
 {
   err->error_exit = error_exit;
   err->emit_message = emit_message;
@@ -241,7 +241,7 @@ jpeg_std_error (struct jpeg_error_mgr *err)
 
   /* Initialize message table pointers */
   err->jpeg_message_table = jpeg_std_message_table;
-  err->last_jpeg_message = (int) JMSG_LASTMSGCODE - 1;
+  err->last_jpeg_message = (int)JMSG_LASTMSGCODE - 1;
 
   err->addon_message_table = NULL;
   err->first_addon_message = 0; /* for safety */
diff --git a/media/libjpeg/jerror.h b/media/libjpeg/jerror.h
index 11a07cb5d0..eb44a1140a 100644
--- a/media/libjpeg/jerror.h
+++ b/media/libjpeg/jerror.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2014, D. R. Commander.
+ * Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -28,7 +28,7 @@
 #define JMAKE_ENUM_LIST
 #else
 /* Repeated inclusions of this file are no-ops unless JMESSAGE is defined */
-#define JMESSAGE(code,string)
+#define JMESSAGE(code, string)
 #endif /* JERROR_H */
 #endif /* JMESSAGE */
 
@@ -36,7 +36,7 @@
 
 typedef enum {
 
-#define JMESSAGE(code,string)   code ,
+#define JMESSAGE(code, string)  code,
 
 #endif /* JMAKE_ENUM_LIST */
 
@@ -44,8 +44,7 @@ JMESSAGE(JMSG_NOMESSAGE, "Bogus message code %d") /* Must be first entry! */
 
 /* For maintenance convenience, list is alphabetical by message code name */
 #if JPEG_LIB_VERSION < 70
-JMESSAGE(JERR_ARITH_NOTIMPL,
-         "Sorry, arithmetic coding is not implemented")
+JMESSAGE(JERR_ARITH_NOTIMPL, "Sorry, arithmetic coding is not implemented")
 #endif
 JMESSAGE(JERR_BAD_ALIGN_TYPE, "ALIGN_TYPE is wrong, please fix")
 JMESSAGE(JERR_BAD_ALLOC_CHUNK, "MAX_ALLOC_CHUNK is wrong, please fix")
@@ -104,7 +103,7 @@ JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
          "Cannot transcode due to multiple use of quantization table %d")
 JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
 JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
-JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
+JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
 JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
 #if JPEG_LIB_VERSION >= 70
 JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
@@ -154,8 +153,7 @@ JMESSAGE(JTRC_HUFFBITS, "        %3d %3d %3d %3d %3d %3d %3d %3d")
 JMESSAGE(JTRC_JFIF, "JFIF APP0 marker: version %d.%02d, density %dx%d  %d")
 JMESSAGE(JTRC_JFIF_BADTHUMBNAILSIZE,
          "Warning: thumbnail image size does not match data length %u")
-JMESSAGE(JTRC_JFIF_EXTENSION,
-         "JFIF extension marker: type 0x%02x, length %u")
+JMESSAGE(JTRC_JFIF_EXTENSION, "JFIF extension marker: type 0x%02x, length %u")
 JMESSAGE(JTRC_JFIF_THUMBNAIL, "    with %d x %d thumbnail image")
 JMESSAGE(JTRC_MISC_MARKER, "Miscellaneous marker 0x%02x, length %u")
 JMESSAGE(JTRC_PARMLESS_MARKER, "Unexpected marker 0x%02x")
@@ -208,6 +206,11 @@ JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
 JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 #endif
 #endif
+JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
 
 #ifdef JMAKE_ENUM_LIST
 
@@ -228,90 +231,101 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 /* The first parameter is either type of cinfo pointer */
 
 /* Fatal errors (print message and exit) */
-#define ERREXIT(cinfo,code)  \
+#define ERREXIT(cinfo, code) \
   ((cinfo)->err->msg_code = (code), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT1(cinfo,code,p1)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT1(cinfo, code, p1) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT2(cinfo,code,p1,p2)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT2(cinfo, code, p1, p2) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT3(cinfo,code,p1,p2,p3)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT3(cinfo, code, p1, p2, p3) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
    (cinfo)->err->msg_parm.i[2] = (p3), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXIT4(cinfo,code,p1,p2,p3,p4)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT4(cinfo, code, p1, p2, p3, p4) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
    (cinfo)->err->msg_parm.i[2] = (p3), \
    (cinfo)->err->msg_parm.i[3] = (p4), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
-#define ERREXITS(cinfo,code,str)  \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXITS(cinfo, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
-   (*(cinfo)->err->error_exit) ((j_common_ptr) (cinfo)))
+   (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
 
 #define MAKESTMT(stuff)         do { stuff } while (0)
 
 /* Nonfatal errors (we can keep going, but the data is probably corrupt) */
-#define WARNMS(cinfo,code)  \
+#define WARNMS(cinfo, code) \
   ((cinfo)->err->msg_code = (code), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS1(cinfo,code,p1)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS1(cinfo, code, p1) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
-#define WARNMS2(cinfo,code,p1,p2)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
+#define WARNMS2(cinfo, code, p1, p2) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), -1))
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), -1))
 
 /* Informational/debugging messages */
-#define TRACEMS(cinfo,lvl,code)  \
+#define TRACEMS(cinfo, lvl, code) \
   ((cinfo)->err->msg_code = (code), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS1(cinfo,lvl,code,p1)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS1(cinfo, lvl, code, p1) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS2(cinfo,lvl,code,p1,p2)  \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS2(cinfo, lvl, code, p1, p2) \
   ((cinfo)->err->msg_code = (code), \
    (cinfo)->err->msg_parm.i[0] = (p1), \
    (cinfo)->err->msg_parm.i[1] = (p2), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
-#define TRACEMS3(cinfo,lvl,code,p1,p2,p3)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
+#define TRACEMS3(cinfo, lvl, code, p1, p2, p3) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS4(cinfo,lvl,code,p1,p2,p3,p4)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS4(cinfo, lvl, code, p1, p2, p3, p4) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3);  _mp[3] = (p4); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS5(cinfo,lvl,code,p1,p2,p3,p4,p5)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS5(cinfo, lvl, code, p1, p2, p3, p4, p5) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3);  _mp[3] = (p4); \
            _mp[4] = (p5); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMS8(cinfo,lvl,code,p1,p2,p3,p4,p5,p6,p7,p8)  \
-  MAKESTMT(int * _mp = (cinfo)->err->msg_parm.i; \
-           _mp[0] = (p1); _mp[1] = (p2); _mp[2] = (p3); _mp[3] = (p4); \
-           _mp[4] = (p5); _mp[5] = (p6); _mp[6] = (p7); _mp[7] = (p8); \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMS8(cinfo, lvl, code, p1, p2, p3, p4, p5, p6, p7, p8) \
+  MAKESTMT(int *_mp = (cinfo)->err->msg_parm.i; \
+           _mp[0] = (p1);  _mp[1] = (p2);  _mp[2] = (p3);  _mp[3] = (p4); \
+           _mp[4] = (p5);  _mp[5] = (p6);  _mp[6] = (p7);  _mp[7] = (p8); \
            (cinfo)->err->msg_code = (code); \
-           (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)); )
-#define TRACEMSS(cinfo,lvl,code,str)  \
+           (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)); )
+#define TRACEMSS(cinfo, lvl, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
-   (*(cinfo)->err->emit_message) ((j_common_ptr) (cinfo), (lvl)))
+   (cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
+   (*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
 
 #endif /* JERROR_H */
diff --git a/media/libjpeg/jfdctflt.c b/media/libjpeg/jfdctflt.c
index b3da3ebda8..ab6f6d0825 100644
--- a/media/libjpeg/jfdctflt.c
+++ b/media/libjpeg/jfdctflt.c
@@ -57,7 +57,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_float (FAST_FLOAT *data)
+jpeg_fdct_float(FAST_FLOAT *data)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -68,7 +68,7 @@ jpeg_fdct_float (FAST_FLOAT *data)
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
     tmp0 = dataptr[0] + dataptr[7];
     tmp7 = dataptr[0] - dataptr[7];
     tmp1 = dataptr[1] + dataptr[6];
@@ -88,7 +88,7 @@ jpeg_fdct_float (FAST_FLOAT *data)
     dataptr[0] = tmp10 + tmp11; /* phase 3 */
     dataptr[4] = tmp10 - tmp11;
 
-    z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
+    z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
     dataptr[2] = tmp13 + z1;    /* phase 5 */
     dataptr[6] = tmp13 - z1;
 
@@ -99,10 +99,10 @@ jpeg_fdct_float (FAST_FLOAT *data)
     tmp12 = tmp6 + tmp7;
 
     /* The rotator is modified from fig 4-8 to avoid extra negations. */
-    z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
-    z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
-    z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
-    z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+    z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+    z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+    z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+    z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
 
     z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
@@ -118,15 +118,15 @@ jpeg_fdct_float (FAST_FLOAT *data)
   /* Pass 2: process columns. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
-    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
-    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
-    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
 
     /* Even part */
 
@@ -135,12 +135,12 @@ jpeg_fdct_float (FAST_FLOAT *data)
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
-    dataptr[DCTSIZE*4] = tmp10 - tmp11;
+    dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+    dataptr[DCTSIZE * 4] = tmp10 - tmp11;
 
-    z1 = (tmp12 + tmp13) * ((FAST_FLOAT) 0.707106781); /* c4 */
-    dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
-    dataptr[DCTSIZE*6] = tmp13 - z1;
+    z1 = (tmp12 + tmp13) * ((FAST_FLOAT)0.707106781); /* c4 */
+    dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+    dataptr[DCTSIZE * 6] = tmp13 - z1;
 
     /* Odd part */
 
@@ -149,18 +149,18 @@ jpeg_fdct_float (FAST_FLOAT *data)
     tmp12 = tmp6 + tmp7;
 
     /* The rotator is modified from fig 4-8 to avoid extra negations. */
-    z5 = (tmp10 - tmp12) * ((FAST_FLOAT) 0.382683433); /* c6 */
-    z2 = ((FAST_FLOAT) 0.541196100) * tmp10 + z5; /* c2-c6 */
-    z4 = ((FAST_FLOAT) 1.306562965) * tmp12 + z5; /* c2+c6 */
-    z3 = tmp11 * ((FAST_FLOAT) 0.707106781); /* c4 */
+    z5 = (tmp10 - tmp12) * ((FAST_FLOAT)0.382683433); /* c6 */
+    z2 = ((FAST_FLOAT)0.541196100) * tmp10 + z5; /* c2-c6 */
+    z4 = ((FAST_FLOAT)1.306562965) * tmp12 + z5; /* c2+c6 */
+    z3 = tmp11 * ((FAST_FLOAT)0.707106781); /* c4 */
 
     z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
-    dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
-    dataptr[DCTSIZE*3] = z13 - z2;
-    dataptr[DCTSIZE*1] = z11 + z4;
-    dataptr[DCTSIZE*7] = z11 - z4;
+    dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+    dataptr[DCTSIZE * 3] = z13 - z2;
+    dataptr[DCTSIZE * 1] = z11 + z4;
+    dataptr[DCTSIZE * 7] = z11 - z4;
 
     dataptr++;                  /* advance pointer to next column */
   }
diff --git a/media/libjpeg/jfdctfst.c b/media/libjpeg/jfdctfst.c
index 5cd83a7b8e..4c9ce0de8f 100644
--- a/media/libjpeg/jfdctfst.c
+++ b/media/libjpeg/jfdctfst.c
@@ -79,10 +79,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_0_382683433  ((JLONG)   98)         /* FIX(0.382683433) */
-#define FIX_0_541196100  ((JLONG)  139)         /* FIX(0.541196100) */
-#define FIX_0_707106781  ((JLONG)  181)         /* FIX(0.707106781) */
-#define FIX_1_306562965  ((JLONG)  334)         /* FIX(1.306562965) */
+#define FIX_0_382683433  ((JLONG)98)            /* FIX(0.382683433) */
+#define FIX_0_541196100  ((JLONG)139)           /* FIX(0.541196100) */
+#define FIX_0_707106781  ((JLONG)181)           /* FIX(0.707106781) */
+#define FIX_1_306562965  ((JLONG)334)           /* FIX(1.306562965) */
 #else
 #define FIX_0_382683433  FIX(0.382683433)
 #define FIX_0_541196100  FIX(0.541196100)
@@ -98,7 +98,7 @@
 
 #ifndef USE_ACCURATE_ROUNDING
 #undef DESCALE
-#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
+#define DESCALE(x, n)  RIGHT_SHIFT(x, n)
 #endif
 
 
@@ -106,7 +106,7 @@
  * descale to yield a DCTELEM result.
  */
 
-#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const)  ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
 
 
 /*
@@ -114,7 +114,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_ifast (DCTELEM *data)
+jpeg_fdct_ifast(DCTELEM *data)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -126,7 +126,7 @@ jpeg_fdct_ifast (DCTELEM *data)
   /* Pass 1: process rows. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
     tmp0 = dataptr[0] + dataptr[7];
     tmp7 = dataptr[0] - dataptr[7];
     tmp1 = dataptr[1] + dataptr[6];
@@ -176,15 +176,15 @@ jpeg_fdct_ifast (DCTELEM *data)
   /* Pass 2: process columns. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
-    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
-    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
-    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
 
     /* Even part */
 
@@ -193,12 +193,12 @@ jpeg_fdct_ifast (DCTELEM *data)
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[DCTSIZE*0] = tmp10 + tmp11; /* phase 3 */
-    dataptr[DCTSIZE*4] = tmp10 - tmp11;
+    dataptr[DCTSIZE * 0] = tmp10 + tmp11; /* phase 3 */
+    dataptr[DCTSIZE * 4] = tmp10 - tmp11;
 
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
-    dataptr[DCTSIZE*2] = tmp13 + z1; /* phase 5 */
-    dataptr[DCTSIZE*6] = tmp13 - z1;
+    dataptr[DCTSIZE * 2] = tmp13 + z1; /* phase 5 */
+    dataptr[DCTSIZE * 6] = tmp13 - z1;
 
     /* Odd part */
 
@@ -215,10 +215,10 @@ jpeg_fdct_ifast (DCTELEM *data)
     z11 = tmp7 + z3;            /* phase 5 */
     z13 = tmp7 - z3;
 
-    dataptr[DCTSIZE*5] = z13 + z2; /* phase 6 */
-    dataptr[DCTSIZE*3] = z13 - z2;
-    dataptr[DCTSIZE*1] = z11 + z4;
-    dataptr[DCTSIZE*7] = z11 - z4;
+    dataptr[DCTSIZE * 5] = z13 + z2; /* phase 6 */
+    dataptr[DCTSIZE * 3] = z13 - z2;
+    dataptr[DCTSIZE * 1] = z11 + z4;
+    dataptr[DCTSIZE * 7] = z11 - z4;
 
     dataptr++;                  /* advance pointer to next column */
   }
diff --git a/media/libjpeg/jfdctint.c b/media/libjpeg/jfdctint.c
index 169bb942ce..c95a3a7fb8 100644
--- a/media/libjpeg/jfdctint.c
+++ b/media/libjpeg/jfdctint.c
@@ -1,14 +1,14 @@
 /*
  * jfdctint.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
  * forward DCT (Discrete Cosine Transform).
  *
  * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
@@ -93,18 +93,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
-#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
-#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
-#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
-#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
-#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
-#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
-#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
-#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
-#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
-#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
-#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)2446)          /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)3196)          /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)4433)          /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)9633)          /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)12299)         /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)16069)         /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)16819)         /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)25172)         /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -129,9 +129,9 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
 #else
-#define MULTIPLY(var,const)  ((var) * (const))
+#define MULTIPLY(var, const)  ((var) * (const))
 #endif
 
 
@@ -140,7 +140,7 @@
  */
 
 GLOBAL(void)
-jpeg_fdct_islow (DCTELEM *data)
+jpeg_fdct_islow(DCTELEM *data)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   JLONG tmp10, tmp11, tmp12, tmp13;
@@ -154,7 +154,7 @@ jpeg_fdct_islow (DCTELEM *data)
   /* furthermore, we scale the results by 2**PASS1_BITS. */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
     tmp0 = dataptr[0] + dataptr[7];
     tmp7 = dataptr[0] - dataptr[7];
     tmp1 = dataptr[1] + dataptr[6];
@@ -173,14 +173,14 @@ jpeg_fdct_islow (DCTELEM *data)
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
-    dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
+    dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS);
+    dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS);
 
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-                                   CONST_BITS-PASS1_BITS);
-    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-                                   CONST_BITS-PASS1_BITS);
+    dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+                                  CONST_BITS - PASS1_BITS);
+    dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+                                  CONST_BITS - PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
      * cK represents cos(K*pi/16).
@@ -197,18 +197,18 @@ jpeg_fdct_islow (DCTELEM *data)
     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
 
-    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
-    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
-    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
+    dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS - PASS1_BITS);
+    dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS - PASS1_BITS);
+    dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS - PASS1_BITS);
+    dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS - PASS1_BITS);
 
     dataptr += DCTSIZE;         /* advance pointer to next row */
   }
@@ -219,15 +219,15 @@ jpeg_fdct_islow (DCTELEM *data)
    */
 
   dataptr = data;
-  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
-    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
-    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
-    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
-    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
-    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
-    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
-    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
-    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];
+  for (ctr = DCTSIZE - 1; ctr >= 0; ctr--) {
+    tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
+    tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
+    tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
+    tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
+    tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
+    tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
+    tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
+    tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
 
     /* Even part per LL&M figure 1 --- note that published figure is faulty;
      * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
@@ -238,14 +238,16 @@ jpeg_fdct_islow (DCTELEM *data)
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
 
-    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
-    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);
+    dataptr[DCTSIZE * 0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS);
+    dataptr[DCTSIZE * 4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS);
 
     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
-    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
-                                           CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE * 2] =
+      (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
+                       CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 6] =
+      (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, -FIX_1_847759065),
+                       CONST_BITS + PASS1_BITS);
 
     /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
      * cK represents cos(K*pi/16).
@@ -262,22 +264,22 @@ jpeg_fdct_islow (DCTELEM *data)
     tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
 
-    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
-                                           CONST_BITS+PASS1_BITS);
-    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
-                                           CONST_BITS+PASS1_BITS);
+    dataptr[DCTSIZE * 7] = (DCTELEM)DESCALE(tmp4 + z1 + z3,
+                                            CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 5] = (DCTELEM)DESCALE(tmp5 + z2 + z4,
+                                            CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 3] = (DCTELEM)DESCALE(tmp6 + z2 + z3,
+                                            CONST_BITS + PASS1_BITS);
+    dataptr[DCTSIZE * 1] = (DCTELEM)DESCALE(tmp7 + z1 + z4,
+                                            CONST_BITS + PASS1_BITS);
 
     dataptr++;                  /* advance pointer to next column */
   }
diff --git a/media/libjpeg/jidctflt.c b/media/libjpeg/jidctflt.c
index 68c521ed7e..5aee74e232 100644
--- a/media/libjpeg/jidctflt.c
+++ b/media/libjpeg/jidctflt.c
@@ -61,7 +61,7 @@
  * entry; produce a float result.
  */
 
-#define DEQUANTIZE(coef,quantval)  (((FAST_FLOAT) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((FAST_FLOAT)(coef)) * (quantval))
 
 
 /*
@@ -69,9 +69,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   FAST_FLOAT tmp10, tmp11, tmp12, tmp13;
@@ -83,12 +83,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPLE *range_limit = cinfo->sample_range_limit;
   int ctr;
   FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */
-  #define _0_125 ((FLOAT_MULT_TYPE)0.125)
+#define _0_125  ((FLOAT_MULT_TYPE)0.125)
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (FLOAT_MULT_TYPE *) compptr->dct_table;
+  quantptr = (FLOAT_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; ctr--) {
     /* Due to quantization, we will usually find that many of the input
@@ -100,22 +100,22 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
      * column DCT calculations can be simplified this way.
      */
 
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-        inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+        inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero */
-      FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0],
-                                    quantptr[DCTSIZE*0] * _0_125);
+      FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE * 0],
+                                    quantptr[DCTSIZE * 0] * _0_125);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
+      wsptr[DCTSIZE * 4] = dcval;
+      wsptr[DCTSIZE * 5] = dcval;
+      wsptr[DCTSIZE * 6] = dcval;
+      wsptr[DCTSIZE * 7] = dcval;
 
       inptr++;                  /* advance pointers to next column */
       quantptr++;
@@ -125,16 +125,16 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0] * _0_125);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2] * _0_125);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4] * _0_125);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6] * _0_125);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0] * _0_125);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2] * _0_125);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4] * _0_125);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6] * _0_125);
 
     tmp10 = tmp0 + tmp2;        /* phase 3 */
     tmp11 = tmp0 - tmp2;
 
     tmp13 = tmp1 + tmp3;        /* phases 5-3 */
-    tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */
+    tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT)1.414213562) - tmp13; /* 2*c4 */
 
     tmp0 = tmp10 + tmp13;       /* phase 2 */
     tmp3 = tmp10 - tmp13;
@@ -143,10 +143,10 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1] * _0_125);
-    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3] * _0_125);
-    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5] * _0_125);
-    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7] * _0_125);
+    tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1] * _0_125);
+    tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3] * _0_125);
+    tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5] * _0_125);
+    tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7] * _0_125);
 
     z13 = tmp6 + tmp5;          /* phase 6 */
     z10 = tmp6 - tmp5;
@@ -154,24 +154,24 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     z12 = tmp4 - tmp7;
 
     tmp7 = z11 + z13;           /* phase 5 */
-    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */
+    tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562); /* 2*c4 */
 
-    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
-    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+    z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+    tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
     tmp4 = tmp10 - tmp5;
 
-    wsptr[DCTSIZE*0] = tmp0 + tmp7;
-    wsptr[DCTSIZE*7] = tmp0 - tmp7;
-    wsptr[DCTSIZE*1] = tmp1 + tmp6;
-    wsptr[DCTSIZE*6] = tmp1 - tmp6;
-    wsptr[DCTSIZE*2] = tmp2 + tmp5;
-    wsptr[DCTSIZE*5] = tmp2 - tmp5;
-    wsptr[DCTSIZE*3] = tmp3 + tmp4;
-    wsptr[DCTSIZE*4] = tmp3 - tmp4;
+    wsptr[DCTSIZE * 0] = tmp0 + tmp7;
+    wsptr[DCTSIZE * 7] = tmp0 - tmp7;
+    wsptr[DCTSIZE * 1] = tmp1 + tmp6;
+    wsptr[DCTSIZE * 6] = tmp1 - tmp6;
+    wsptr[DCTSIZE * 2] = tmp2 + tmp5;
+    wsptr[DCTSIZE * 5] = tmp2 - tmp5;
+    wsptr[DCTSIZE * 3] = tmp3 + tmp4;
+    wsptr[DCTSIZE * 4] = tmp3 - tmp4;
 
     inptr++;                    /* advance pointers to next column */
     quantptr++;
@@ -192,12 +192,12 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Apply signed->unsigned and prepare float->int conversion */
-    z5 = wsptr[0] + ((FAST_FLOAT) CENTERJSAMPLE + (FAST_FLOAT) 0.5);
+    z5 = wsptr[0] + ((FAST_FLOAT)CENTERJSAMPLE + (FAST_FLOAT)0.5);
     tmp10 = z5 + wsptr[4];
     tmp11 = z5 - wsptr[4];
 
     tmp13 = wsptr[2] + wsptr[6];
-    tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13;
+    tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT)1.414213562) - tmp13;
 
     tmp0 = tmp10 + tmp13;
     tmp3 = tmp10 - tmp13;
@@ -212,11 +212,11 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     z12 = wsptr[1] - wsptr[7];
 
     tmp7 = z11 + z13;
-    tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562);
+    tmp11 = (z11 - z13) * ((FAST_FLOAT)1.414213562);
 
-    z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */
-    tmp10 = z5 - z12 * ((FAST_FLOAT) 1.082392200); /* 2*(c2-c6) */
-    tmp12 = z5 - z10 * ((FAST_FLOAT) 2.613125930); /* 2*(c2+c6) */
+    z5 = (z10 + z12) * ((FAST_FLOAT)1.847759065); /* 2*c2 */
+    tmp10 = z5 - z12 * ((FAST_FLOAT)1.082392200); /* 2*(c2-c6) */
+    tmp12 = z5 - z10 * ((FAST_FLOAT)2.613125930); /* 2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;
     tmp5 = tmp11 - tmp6;
@@ -224,14 +224,14 @@ jpeg_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage: float->int conversion and range-limit */
 
-    outptr[0] = range_limit[((int) (tmp0 + tmp7)) & RANGE_MASK];
-    outptr[7] = range_limit[((int) (tmp0 - tmp7)) & RANGE_MASK];
-    outptr[1] = range_limit[((int) (tmp1 + tmp6)) & RANGE_MASK];
-    outptr[6] = range_limit[((int) (tmp1 - tmp6)) & RANGE_MASK];
-    outptr[2] = range_limit[((int) (tmp2 + tmp5)) & RANGE_MASK];
-    outptr[5] = range_limit[((int) (tmp2 - tmp5)) & RANGE_MASK];
-    outptr[3] = range_limit[((int) (tmp3 + tmp4)) & RANGE_MASK];
-    outptr[4] = range_limit[((int) (tmp3 - tmp4)) & RANGE_MASK];
+    outptr[0] = range_limit[((int)(tmp0 + tmp7)) & RANGE_MASK];
+    outptr[7] = range_limit[((int)(tmp0 - tmp7)) & RANGE_MASK];
+    outptr[1] = range_limit[((int)(tmp1 + tmp6)) & RANGE_MASK];
+    outptr[6] = range_limit[((int)(tmp1 - tmp6)) & RANGE_MASK];
+    outptr[2] = range_limit[((int)(tmp2 + tmp5)) & RANGE_MASK];
+    outptr[5] = range_limit[((int)(tmp2 - tmp5)) & RANGE_MASK];
+    outptr[3] = range_limit[((int)(tmp3 + tmp4)) & RANGE_MASK];
+    outptr[4] = range_limit[((int)(tmp3 - tmp4)) & RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
diff --git a/media/libjpeg/jidctfst.c b/media/libjpeg/jidctfst.c
index 10db739b86..89a20c937b 100644
--- a/media/libjpeg/jidctfst.c
+++ b/media/libjpeg/jidctfst.c
@@ -92,10 +92,10 @@
  */
 
 #if CONST_BITS == 8
-#define FIX_1_082392200  ((JLONG)  277)         /* FIX(1.082392200) */
-#define FIX_1_414213562  ((JLONG)  362)         /* FIX(1.414213562) */
-#define FIX_1_847759065  ((JLONG)  473)         /* FIX(1.847759065) */
-#define FIX_2_613125930  ((JLONG)  669)         /* FIX(2.613125930) */
+#define FIX_1_082392200  ((JLONG)277)           /* FIX(1.082392200) */
+#define FIX_1_414213562  ((JLONG)362)           /* FIX(1.414213562) */
+#define FIX_1_847759065  ((JLONG)473)           /* FIX(1.847759065) */
+#define FIX_2_613125930  ((JLONG)669)           /* FIX(2.613125930) */
 #else
 #define FIX_1_082392200  FIX(1.082392200)
 #define FIX_1_414213562  FIX(1.414213562)
@@ -111,7 +111,7 @@
 
 #ifndef USE_ACCURATE_ROUNDING
 #undef DESCALE
-#define DESCALE(x,n)  RIGHT_SHIFT(x, n)
+#define DESCALE(x, n)  RIGHT_SHIFT(x, n)
 #endif
 
 
@@ -119,7 +119,7 @@
  * descale to yield a DCTELEM result.
  */
 
-#define MULTIPLY(var,const)  ((DCTELEM) DESCALE((var) * (const), CONST_BITS))
+#define MULTIPLY(var, const)  ((DCTELEM)DESCALE((var) * (const), CONST_BITS))
 
 
 /* Dequantize a coefficient by multiplying it by the multiplier-table
@@ -129,10 +129,10 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define DEQUANTIZE(coef,quantval)  (((IFAST_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((IFAST_MULT_TYPE)(coef)) * (quantval))
 #else
-#define DEQUANTIZE(coef,quantval)  \
-        DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS)
+#define DEQUANTIZE(coef, quantval) \
+  DESCALE((coef) * (quantval), IFAST_SCALE_BITS - PASS1_BITS)
 #endif
 
 
@@ -147,19 +147,19 @@
 #else
 #define DCTELEMBITS  32         /* DCTELEM must be 32 bits */
 #endif
-#define IRIGHT_SHIFT(x,shft)  \
-    ((ishift_temp = (x)) < 0 ? \
-     (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \
-     (ishift_temp >> (shft)))
+#define IRIGHT_SHIFT(x, shft) \
+  ((ishift_temp = (x)) < 0 ? \
+   (ishift_temp >> (shft)) | ((~((DCTELEM)0)) << (DCTELEMBITS - (shft))) : \
+   (ishift_temp >> (shft)))
 #else
 #define ISHIFT_TEMPS
-#define IRIGHT_SHIFT(x,shft)    ((x) >> (shft))
+#define IRIGHT_SHIFT(x, shft)   ((x) >> (shft))
 #endif
 
 #ifdef USE_ACCURATE_ROUNDING
-#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n))
+#define IDESCALE(x, n)  ((int)IRIGHT_SHIFT((x) + (1 << ((n) - 1)), n))
 #else
-#define IDESCALE(x,n)  ((int) IRIGHT_SHIFT(x, n))
+#define IDESCALE(x, n)  ((int)IRIGHT_SHIFT(x, n))
 #endif
 
 
@@ -168,9 +168,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   DCTELEM tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   DCTELEM tmp10, tmp11, tmp12, tmp13;
@@ -188,7 +188,7 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+  quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; ctr--) {
     /* Due to quantization, we will usually find that many of the input
@@ -200,21 +200,21 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
      * column DCT calculations can be simplified this way.
      */
 
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-        inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+        inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero */
-      int dcval = (int) DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+      int dcval = (int)DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
+      wsptr[DCTSIZE * 4] = dcval;
+      wsptr[DCTSIZE * 5] = dcval;
+      wsptr[DCTSIZE * 6] = dcval;
+      wsptr[DCTSIZE * 7] = dcval;
 
       inptr++;                  /* advance pointers to next column */
       quantptr++;
@@ -224,10 +224,10 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = tmp0 + tmp2;        /* phase 3 */
     tmp11 = tmp0 - tmp2;
@@ -242,10 +242,10 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    tmp4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    tmp5 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    tmp6 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    tmp7 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     z13 = tmp6 + tmp5;          /* phase 6 */
     z10 = tmp6 - tmp5;
@@ -257,20 +257,20 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+    tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
     tmp4 = tmp10 + tmp5;
 
-    wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-    wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-    wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-    wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-    wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
-    wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-    wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-    wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+    wsptr[DCTSIZE * 0] = (int)(tmp0 + tmp7);
+    wsptr[DCTSIZE * 7] = (int)(tmp0 - tmp7);
+    wsptr[DCTSIZE * 1] = (int)(tmp1 + tmp6);
+    wsptr[DCTSIZE * 6] = (int)(tmp1 - tmp6);
+    wsptr[DCTSIZE * 2] = (int)(tmp2 + tmp5);
+    wsptr[DCTSIZE * 5] = (int)(tmp2 - tmp5);
+    wsptr[DCTSIZE * 4] = (int)(tmp3 + tmp4);
+    wsptr[DCTSIZE * 3] = (int)(tmp3 - tmp4);
 
     inptr++;                    /* advance pointers to next column */
     quantptr++;
@@ -296,8 +296,8 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[IDESCALE(wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval =
+        range_limit[IDESCALE(wsptr[0], PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -315,12 +315,12 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Even part */
 
-    tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
-    tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
+    tmp10 = ((DCTELEM)wsptr[0] + (DCTELEM)wsptr[4]);
+    tmp11 = ((DCTELEM)wsptr[0] - (DCTELEM)wsptr[4]);
 
-    tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
-    tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562)
-            - tmp13;
+    tmp13 = ((DCTELEM)wsptr[2] + (DCTELEM)wsptr[6]);
+    tmp12 =
+      MULTIPLY((DCTELEM)wsptr[2] - (DCTELEM)wsptr[6], FIX_1_414213562) - tmp13;
 
     tmp0 = tmp10 + tmp13;
     tmp3 = tmp10 - tmp13;
@@ -329,17 +329,17 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
-    z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
-    z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
-    z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
+    z13 = (DCTELEM)wsptr[5] + (DCTELEM)wsptr[3];
+    z10 = (DCTELEM)wsptr[5] - (DCTELEM)wsptr[3];
+    z11 = (DCTELEM)wsptr[1] + (DCTELEM)wsptr[7];
+    z12 = (DCTELEM)wsptr[1] - (DCTELEM)wsptr[7];
 
     tmp7 = z11 + z13;           /* phase 5 */
     tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
 
     z5 = MULTIPLY(z10 + z12, FIX_1_847759065); /* 2*c2 */
     tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; /* 2*(c2-c6) */
-    tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; /* -2*(c2+c6) */
+    tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5; /* -2*(c2+c6) */
 
     tmp6 = tmp12 - tmp7;        /* phase 2 */
     tmp5 = tmp11 - tmp6;
@@ -347,22 +347,22 @@ jpeg_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage: scale down by a factor of 8 and range-limit */
 
-    outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] =
+      range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[7] =
+      range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[1] =
+      range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[6] =
+      range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[2] =
+      range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[5] =
+      range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[4] =
+      range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS + 3) & RANGE_MASK];
+    outptr[3] =
+      range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS + 3) & RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
diff --git a/media/libjpeg/jidctint.c b/media/libjpeg/jidctint.c
index 3ac6caf692..bb08748019 100644
--- a/media/libjpeg/jidctint.c
+++ b/media/libjpeg/jidctint.c
@@ -1,15 +1,15 @@
 /*
  * jidctint.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * Modification developed 2002-2009 by Guido Vollbeding.
+ * Modification developed 2002-2018 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
  * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
  * must also perform dequantization of the input coefficients.
  *
@@ -115,18 +115,18 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_298631336  ((JLONG)  2446)        /* FIX(0.298631336) */
-#define FIX_0_390180644  ((JLONG)  3196)        /* FIX(0.390180644) */
-#define FIX_0_541196100  ((JLONG)  4433)        /* FIX(0.541196100) */
-#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
-#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
-#define FIX_1_175875602  ((JLONG)  9633)        /* FIX(1.175875602) */
-#define FIX_1_501321110  ((JLONG)  12299)       /* FIX(1.501321110) */
-#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
-#define FIX_1_961570560  ((JLONG)  16069)       /* FIX(1.961570560) */
-#define FIX_2_053119869  ((JLONG)  16819)       /* FIX(2.053119869) */
-#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
-#define FIX_3_072711026  ((JLONG)  25172)       /* FIX(3.072711026) */
+#define FIX_0_298631336  ((JLONG)2446)          /* FIX(0.298631336) */
+#define FIX_0_390180644  ((JLONG)3196)          /* FIX(0.390180644) */
+#define FIX_0_541196100  ((JLONG)4433)          /* FIX(0.541196100) */
+#define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
+#define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
+#define FIX_1_175875602  ((JLONG)9633)          /* FIX(1.175875602) */
+#define FIX_1_501321110  ((JLONG)12299)         /* FIX(1.501321110) */
+#define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
+#define FIX_1_961570560  ((JLONG)16069)         /* FIX(1.961570560) */
+#define FIX_2_053119869  ((JLONG)16819)         /* FIX(2.053119869) */
+#define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
+#define FIX_3_072711026  ((JLONG)25172)         /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
@@ -151,9 +151,9 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
 #else
-#define MULTIPLY(var,const)  ((var) * (const))
+#define MULTIPLY(var, const)  ((var) * (const))
 #endif
 
 
@@ -162,7 +162,7 @@
  * are 16 bits or less, so either int or short multiply will work.
  */
 
-#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((ISLOW_MULT_TYPE)(coef)) * (quantval))
 
 
 /*
@@ -170,9 +170,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3;
   JLONG tmp10, tmp11, tmp12, tmp13;
@@ -191,7 +191,7 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   /* furthermore, we scale the results by 2**PASS1_BITS. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; ctr--) {
     /* Due to quantization, we will usually find that many of the input
@@ -203,22 +203,22 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
      * column DCT calculations can be simplified this way.
      */
 
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
-        inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
+        inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero */
-      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
-                             PASS1_BITS);
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+                             quantptr[DCTSIZE * 0]), PASS1_BITS);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
-      wsptr[DCTSIZE*4] = dcval;
-      wsptr[DCTSIZE*5] = dcval;
-      wsptr[DCTSIZE*6] = dcval;
-      wsptr[DCTSIZE*7] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
+      wsptr[DCTSIZE * 4] = dcval;
+      wsptr[DCTSIZE * 5] = dcval;
+      wsptr[DCTSIZE * 6] = dcval;
+      wsptr[DCTSIZE * 7] = dcval;
 
       inptr++;                  /* advance pointers to next column */
       quantptr++;
@@ -229,15 +229,15 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+    tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
 
     tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
     tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
@@ -251,10 +251,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    tmp3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
 
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
@@ -266,10 +266,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
@@ -281,14 +281,14 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
+    wsptr[DCTSIZE * 0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[DCTSIZE * 4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
 
     inptr++;                    /* advance pointers to next column */
     quantptr++;
@@ -314,8 +314,8 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                               PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -334,15 +334,15 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part: reverse the even part of the forward DCT. */
     /* The rotator is sqrt(2)*c(-6). */
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[6];
 
     z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+    tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
     tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
 
-    tmp0 = LEFT_SHIFT((JLONG) wsptr[0] + (JLONG) wsptr[4], CONST_BITS);
-    tmp1 = LEFT_SHIFT((JLONG) wsptr[0] - (JLONG) wsptr[4], CONST_BITS);
+    tmp0 = LEFT_SHIFT((JLONG)wsptr[0] + (JLONG)wsptr[4], CONST_BITS);
+    tmp1 = LEFT_SHIFT((JLONG)wsptr[0] - (JLONG)wsptr[4], CONST_BITS);
 
     tmp10 = tmp0 + tmp3;
     tmp13 = tmp0 - tmp3;
@@ -353,10 +353,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
      */
 
-    tmp0 = (JLONG) wsptr[7];
-    tmp1 = (JLONG) wsptr[5];
-    tmp2 = (JLONG) wsptr[3];
-    tmp3 = (JLONG) wsptr[1];
+    tmp0 = (JLONG)wsptr[7];
+    tmp1 = (JLONG)wsptr[5];
+    tmp2 = (JLONG)wsptr[3];
+    tmp3 = (JLONG)wsptr[1];
 
     z1 = tmp0 + tmp3;
     z2 = tmp1 + tmp2;
@@ -368,10 +368,10 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
     tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
     tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
-    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
-    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
-    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
+    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
+    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
+    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
+    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
 
     z3 += z5;
     z4 += z5;
@@ -383,30 +383,30 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
 
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp3,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[(int) DESCALE(tmp10 - tmp3,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp11 + tmp2,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) DESCALE(tmp11 - tmp2,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 + tmp1,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) DESCALE(tmp12 - tmp1,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp13 + tmp0,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) DESCALE(tmp13 - tmp0,
-                                          CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp3,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[7] = range_limit[(int)DESCALE(tmp10 - tmp3,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)DESCALE(tmp11 + tmp2,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)DESCALE(tmp11 - tmp2,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)DESCALE(tmp12 + tmp1,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)DESCALE(tmp12 - tmp1,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)DESCALE(tmp13 + tmp0,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)DESCALE(tmp13 - tmp0,
+                                         CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
@@ -417,16 +417,16 @@ jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 7x7 output block.
+ * producing a reduced-size 7x7 output block.
  *
  * Optimized algorithm with 12 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/14).
  */
 
 GLOBAL(void)
-jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
   JLONG z1, z2, z3;
@@ -436,25 +436,25 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[7*7];   /* buffers data between passes */
+  int workspace[7 * 7];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp13 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp13 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
@@ -468,15 +468,15 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
 
     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
     tmp0 = tmp1 - tmp2;
     tmp1 += tmp2;
-    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
     tmp1 += tmp2;
     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
     tmp0 += z2;
@@ -484,13 +484,13 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[7 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 6] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 5] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 4] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[7 * 3] = (int)RIGHT_SHIFT(tmp13, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 7 rows from work array, store into output array. */
@@ -502,12 +502,12 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp13 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp13 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[4];
-    z3 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[4];
+    z3 = (JLONG)wsptr[6];
 
     tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
     tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
@@ -521,15 +521,15 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
 
     tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
     tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
     tmp0 = tmp1 - tmp2;
     tmp1 += tmp2;
-    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
+    tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
     tmp1 += tmp2;
     z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
     tmp0 += z2;
@@ -537,27 +537,27 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 7;         /* advance pointer to next row */
   }
@@ -573,9 +573,9 @@ jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
   JLONG z1, z2, z3;
@@ -585,35 +585,35 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[6*6];   /* buffers data between passes */
+  int workspace[6 * 6];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
     tmp1 = tmp0 + tmp10;
-    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
-    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS - PASS1_BITS);
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
     tmp10 = tmp1 + tmp0;
     tmp12 = tmp1 - tmp0;
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -621,12 +621,12 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[6*1] = (int) (tmp11 + tmp1);
-    wsptr[6*4] = (int) (tmp11 - tmp1);
-    wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[6 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[6 * 5] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[6 * 1] = (int)(tmp11 + tmp1);
+    wsptr[6 * 4] = (int)(tmp11 - tmp1);
+    wsptr[6 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[6 * 3] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 6 rows from work array, store into output array. */
@@ -638,22 +638,22 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
-    tmp2 = (JLONG) wsptr[4];
+    tmp2 = (JLONG)wsptr[4];
     tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
     tmp1 = tmp0 + tmp10;
     tmp11 = tmp0 - tmp10 - tmp10;
-    tmp10 = (JLONG) wsptr[2];
+    tmp10 = (JLONG)wsptr[2];
     tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
     tmp10 = tmp1 + tmp0;
     tmp12 = tmp1 - tmp0;
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
     tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
     tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
     tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
@@ -661,24 +661,24 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 6;         /* advance pointer to next row */
   }
@@ -694,9 +694,9 @@ jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
   JLONG z1, z2, z3;
@@ -706,23 +706,23 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[5*5];   /* buffers data between passes */
+  int workspace[5 * 5];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    tmp12 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
     z3 = tmp12 + z2;
@@ -732,8 +732,8 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
@@ -741,11 +741,11 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
+    wsptr[5 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 4] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 3] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[5 * 2] = (int)RIGHT_SHIFT(tmp12, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 5 rows from work array, store into output array. */
@@ -757,10 +757,10 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp12 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp12 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
-    tmp0 = (JLONG) wsptr[2];
-    tmp1 = (JLONG) wsptr[4];
+    tmp0 = (JLONG)wsptr[2];
+    tmp1 = (JLONG)wsptr[4];
     z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
     z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
     z3 = tmp12 + z2;
@@ -770,8 +770,8 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z2 = (JLONG) wsptr[1];
-    z3 = (JLONG) wsptr[3];
+    z2 = (JLONG)wsptr[1];
+    z3 = (JLONG)wsptr[3];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
     tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
@@ -779,21 +779,21 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 5;         /* advance pointer to next row */
   }
@@ -809,9 +809,9 @@ jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp2, tmp10, tmp12;
   JCOEFPTR inptr;
@@ -820,36 +820,36 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[3*3];   /* buffers data between passes */
+  int workspace[3 * 3];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
-    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
     tmp10 = tmp0 + tmp12;
     tmp2 = tmp0 - tmp12 - tmp12;
 
     /* Odd part */
 
-    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    tmp12 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 
     /* Final output stage */
 
-    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
+    wsptr[3 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[3 * 2] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[3 * 1] = (int)RIGHT_SHIFT(tmp2, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 3 rows from work array, store into output array. */
@@ -861,29 +861,29 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
-    tmp2 = (JLONG) wsptr[2];
+    tmp2 = (JLONG)wsptr[2];
     tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
     tmp10 = tmp0 + tmp12;
     tmp2 = tmp0 - tmp12 - tmp12;
 
     /* Odd part */
 
-    tmp12 = (JLONG) wsptr[1];
+    tmp12 = (JLONG)wsptr[1];
     tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 3;         /* advance pointer to next row */
   }
@@ -899,9 +899,9 @@ jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG z1, z2, z3, z4;
@@ -911,25 +911,25 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*9];   /* buffers data between passes */
+  int workspace[8 * 9];         /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
     tmp1 = tmp0 + tmp3;
@@ -949,12 +949,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
-    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+    z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
 
     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
@@ -966,15 +966,15 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
-    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
-    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
-    wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
-    wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp14, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 9 rows from work array, store into output array. */
@@ -986,12 +986,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[4];
-    z3 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[4];
+    z3 = (JLONG)wsptr[6];
 
     tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
     tmp1 = tmp0 + tmp3;
@@ -1011,12 +1011,12 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
-    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
+    z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
 
     tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
     tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
@@ -1028,33 +1028,33 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13 + tmp3,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp13 - tmp3,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp14,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1070,9 +1070,9 @@ jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
@@ -1083,32 +1083,32 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*10];  /* buffers data between passes */
+  int workspace[8 * 10];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z3 = LEFT_SHIFT(z3, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
     tmp10 = z3 + z1;
     tmp11 = z3 - z2;
 
     tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
-                        CONST_BITS-PASS1_BITS);  /* c0 = (c4-c8)*2 */
+                        CONST_BITS - PASS1_BITS); /* c0 = (c4-c8)*2 */
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1121,10 +1121,10 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = z2 + z4;
     tmp13 = z2 - z4;
@@ -1148,16 +1148,16 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2] = (int) (tmp22 + tmp12);
-    wsptr[8*7] = (int) (tmp22 - tmp12);
-    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2] = (int)(tmp22 + tmp12);
+    wsptr[8 * 7] = (int)(tmp22 - tmp12);
+    wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 10 rows from work array, store into output array. */
@@ -1169,9 +1169,9 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z3 = LEFT_SHIFT(z3, CONST_BITS);
-    z4 = (JLONG) wsptr[4];
+    z4 = (JLONG)wsptr[4];
     z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
     z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
     tmp10 = z3 + z1;
@@ -1179,8 +1179,8 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1);         /* c0 = (c4-c8)*2 */
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[6];
 
     z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
     tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
@@ -1193,11 +1193,11 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
     z3 = LEFT_SHIFT(z3, CONST_BITS);
-    z4 = (JLONG) wsptr[7];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = z2 + z4;
     tmp13 = z2 - z4;
@@ -1220,36 +1220,36 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
-    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                              CONST_BITS+PASS1_BITS+3)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
+    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                             CONST_BITS + PASS1_BITS + 3) &
+                            RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1258,16 +1258,16 @@ jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 11x11 output block.
+ * producing an 11x11 output block.
  *
  * Optimized algorithm with 24 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/22).
  */
 
 GLOBAL(void)
-jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1278,30 +1278,30 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*11];  /* buffers data between passes */
+  int workspace[8 * 11];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp10 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
+    tmp10 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
     z4 = z1 + z3;
-    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
     z4 -= z2;
     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
     tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1316,10 +1316,10 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = z1 + z2;
     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1331,26 +1331,26 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
-    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
     tmp11 += z1;
     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
-    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+    tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 11 rows from work array, store into output array. */
@@ -1362,17 +1362,17 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp10 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp10 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[4];
-    z3 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[4];
+    z3 = (JLONG)wsptr[6];
 
     tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
     tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
     z4 = z1 + z3;
-    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
+    tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
     z4 -= z2;
     tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
     tmp21 = tmp20 + tmp23 + tmp25 -
@@ -1387,10 +1387,10 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = z1 + z2;
     tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
@@ -1402,48 +1402,48 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
     tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
     tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
-    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
+    z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
     tmp11 += z1;
     tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
-    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
+    tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
              MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
              MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1459,9 +1459,9 @@ jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
@@ -1472,32 +1472,32 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*12];  /* buffers data between passes */
+  int workspace[8 * 12];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z3 = LEFT_SHIFT(z3, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
     z2 = LEFT_SHIFT(z2, CONST_BITS);
 
     tmp12 = z1 - z2;
@@ -1517,19 +1517,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
-    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+    tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
 
     tmp10 = z1 + z3;
     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
-    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
@@ -1543,18 +1543,18 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 12 rows from work array, store into output array. */
@@ -1566,19 +1566,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z3 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z3 = LEFT_SHIFT(z3, CONST_BITS);
 
-    z4 = (JLONG) wsptr[4];
+    z4 = (JLONG)wsptr[4];
     z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
 
-    z1 = (JLONG) wsptr[2];
+    z1 = (JLONG)wsptr[2];
     z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z2 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[6];
     z2 = LEFT_SHIFT(z2, CONST_BITS);
 
     tmp12 = z1 - z2;
@@ -1598,19 +1598,19 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
-    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
+    tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
 
     tmp10 = z1 + z3;
     tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
     tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
     tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
-    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
+    tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
     tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
     tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
     tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
@@ -1624,42 +1624,42 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1675,9 +1675,9 @@ jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1688,25 +1688,25 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*13];  /* buffers data between passes */
+  int workspace[8 * 13];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
@@ -1721,22 +1721,22 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
 
     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
-    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+    tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
 
     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
 
-    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
-    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+    tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
+    tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
 
     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
@@ -1744,13 +1744,13 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
     tmp10 = tmp11 + tmp12 + tmp13 -
             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
-    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
-    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
     tmp11 += tmp14;
     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
-    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
     tmp12 += tmp14;
     tmp13 += tmp14;
     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
@@ -1763,19 +1763,19 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 13 rows from work array, store into output array. */
@@ -1787,12 +1787,12 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[4];
-    z4 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[4];
+    z4 = (JLONG)wsptr[6];
 
     tmp10 = z3 + z4;
     tmp11 = z3 - z4;
@@ -1807,22 +1807,22 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
 
     tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
-    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
+    tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
 
     tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
     tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
 
-    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
-    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
+    tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
+    tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
 
     tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
     tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
@@ -1830,13 +1830,13 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
     tmp10 = tmp11 + tmp12 + tmp13 -
             MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
-    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
+    tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
     tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
     tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
-    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
+    tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
     tmp11 += tmp14;
     tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
-    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
+    tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
     tmp12 += tmp14;
     tmp13 += tmp14;
     tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
@@ -1849,45 +1849,45 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -1903,9 +1903,9 @@ jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
@@ -1916,22 +1916,22 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*14];  /* buffers data between passes */
+  int workspace[8 * 14];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
@@ -1941,10 +1941,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp12 = z1 - z4;
 
     tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
-                        CONST_BITS-PASS1_BITS);  /* c0 = (c4+c12-c8)*2 */
+                        CONST_BITS - PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
 
@@ -1962,10 +1962,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
     tmp13 = LEFT_SHIFT(z4, CONST_BITS);
 
     tmp14 = z1 + z3;
@@ -1978,7 +1978,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
     tmp16 += tmp15;
     z1    += z4;
-    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
+    z4    = MULTIPLY(z2 + z3, -FIX(0.158341681)) - tmp13;  /* -c13 */
     tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
     tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
     z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
@@ -1989,20 +1989,20 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) (tmp23 + tmp13);
-    wsptr[8*10] = (int) (tmp23 - tmp13);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)(tmp23 + tmp13);
+    wsptr[8 * 10] = (int)(tmp23 - tmp13);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 14 rows from work array, store into output array. */
@@ -2014,9 +2014,9 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
-    z4 = (JLONG) wsptr[4];
+    z4 = (JLONG)wsptr[4];
     z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
     z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
     z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
@@ -2027,8 +2027,8 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1);    /* c0 = (c4+c12-c8)*2 */
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[6];
 
     z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
 
@@ -2046,10 +2046,10 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
     z4 = LEFT_SHIFT(z4, CONST_BITS);
 
     tmp14 = z1 + z3;
@@ -2061,7 +2061,7 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     z1    -= z2;
     tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
     tmp16 += tmp15;
-    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
+    tmp13 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - z4;     /* -c13 */
     tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
     tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
     tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
@@ -2072,48 +2072,48 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -2129,9 +2129,9 @@ jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2142,25 +2142,25 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*15];  /* buffers data between passes */
+  int workspace[8 * 15];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     z1 = LEFT_SHIFT(z1, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
+    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2195,19 +2195,19 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp13 = z2 - z4;
     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
 
-    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
-    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
+    tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
     z2 = z1 - z4;
     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
 
@@ -2220,21 +2220,21 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 15 rows from work array, store into output array. */
@@ -2246,12 +2246,12 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    z1 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     z1 = LEFT_SHIFT(z1, CONST_BITS);
 
-    z2 = (JLONG) wsptr[2];
-    z3 = (JLONG) wsptr[4];
-    z4 = (JLONG) wsptr[6];
+    z2 = (JLONG)wsptr[2];
+    z3 = (JLONG)wsptr[4];
+    z4 = (JLONG)wsptr[6];
 
     tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
     tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
@@ -2286,19 +2286,19 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z4 = (JLONG) wsptr[5];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z4 = (JLONG)wsptr[5];
     z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
-    z4 = (JLONG) wsptr[7];
+    z4 = (JLONG)wsptr[7];
 
     tmp13 = z2 - z4;
     tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
     tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
     tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
 
-    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
-    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
+    tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
+    tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
     z2 = z1 - z4;
     tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
 
@@ -2311,51 +2311,51 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
@@ -2371,9 +2371,9 @@ jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                 JCOEFPTR coef_block,
-                 JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                JDIMENSION output_col)
 {
   JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
@@ -2384,23 +2384,23 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[8*16];  /* buffers data between passes */
+  int workspace[8 * 16];        /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
 
@@ -2409,8 +2409,8 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp0 - tmp2;
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
     z3 = z1 - z2;
     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
@@ -2431,10 +2431,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
 
     tmp11 = z1 + z3;
 
@@ -2455,13 +2455,13 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
     z2    += z4;
-    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
     tmp1  += z1;
     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
-    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
     tmp12 += z2;
-    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
     tmp2  += z2;
     tmp3  += z2;
     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
@@ -2470,22 +2470,22 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
-    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
-    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
-    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
-    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
-    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
-    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
-    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
-    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
-    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
-    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
-    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
-    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
+    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 15] = (int)RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS - PASS1_BITS);
+    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS - PASS1_BITS);
+    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS - PASS1_BITS);
   }
 
   /* Pass 2: process 16 rows from work array, store into output array. */
@@ -2497,10 +2497,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     /* Even part */
 
     /* Add fudge factor here for final descale. */
-    tmp0 = (JLONG) wsptr[0] + (ONE << (PASS1_BITS+2));
+    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
 
-    z1 = (JLONG) wsptr[4];
+    z1 = (JLONG)wsptr[4];
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
     tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
 
@@ -2509,8 +2509,8 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp12 = tmp0 + tmp2;
     tmp13 = tmp0 - tmp2;
 
-    z1 = (JLONG) wsptr[2];
-    z2 = (JLONG) wsptr[6];
+    z1 = (JLONG)wsptr[2];
+    z2 = (JLONG)wsptr[6];
     z3 = z1 - z2;
     z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
     z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
@@ -2531,10 +2531,10 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[1];
-    z2 = (JLONG) wsptr[3];
-    z3 = (JLONG) wsptr[5];
-    z4 = (JLONG) wsptr[7];
+    z1 = (JLONG)wsptr[1];
+    z2 = (JLONG)wsptr[3];
+    z3 = (JLONG)wsptr[5];
+    z4 = (JLONG)wsptr[7];
 
     tmp11 = z1 + z3;
 
@@ -2555,13 +2555,13 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
     tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
     z2    += z4;
-    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
+    z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
     tmp1  += z1;
     tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
-    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
+    z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
     tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
     tmp12 += z2;
-    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
+    z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
     tmp2  += z2;
     tmp3  += z2;
     z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
@@ -2570,54 +2570,54 @@ jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Final output stage */
 
-    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
-    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
-                                               CONST_BITS+PASS1_BITS+3)
-                             & RANGE_MASK];
+    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp0,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[15] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp0,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp1,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp1,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp2,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp2,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp3,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp3,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp10,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp11,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp12,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27 + tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
+    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp27 - tmp13,
+                                              CONST_BITS + PASS1_BITS + 3) &
+                             RANGE_MASK];
 
     wsptr += 8;         /* advance pointer to next row */
   }
diff --git a/media/libjpeg/jidctred.c b/media/libjpeg/jidctred.c
index 7a81803b8d..1dd65a94d9 100644
--- a/media/libjpeg/jidctred.c
+++ b/media/libjpeg/jidctred.c
@@ -1,7 +1,7 @@
 /*
  * jidctred.c
  *
- * This file was part of the Independent JPEG Group's software.
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, D. R. Commander.
@@ -58,20 +58,20 @@
  */
 
 #if CONST_BITS == 13
-#define FIX_0_211164243  ((JLONG)  1730)        /* FIX(0.211164243) */
-#define FIX_0_509795579  ((JLONG)  4176)        /* FIX(0.509795579) */
-#define FIX_0_601344887  ((JLONG)  4926)        /* FIX(0.601344887) */
-#define FIX_0_720959822  ((JLONG)  5906)        /* FIX(0.720959822) */
-#define FIX_0_765366865  ((JLONG)  6270)        /* FIX(0.765366865) */
-#define FIX_0_850430095  ((JLONG)  6967)        /* FIX(0.850430095) */
-#define FIX_0_899976223  ((JLONG)  7373)        /* FIX(0.899976223) */
-#define FIX_1_061594337  ((JLONG)  8697)        /* FIX(1.061594337) */
-#define FIX_1_272758580  ((JLONG)  10426)       /* FIX(1.272758580) */
-#define FIX_1_451774981  ((JLONG)  11893)       /* FIX(1.451774981) */
-#define FIX_1_847759065  ((JLONG)  15137)       /* FIX(1.847759065) */
-#define FIX_2_172734803  ((JLONG)  17799)       /* FIX(2.172734803) */
-#define FIX_2_562915447  ((JLONG)  20995)       /* FIX(2.562915447) */
-#define FIX_3_624509785  ((JLONG)  29692)       /* FIX(3.624509785) */
+#define FIX_0_211164243  ((JLONG)1730)          /* FIX(0.211164243) */
+#define FIX_0_509795579  ((JLONG)4176)          /* FIX(0.509795579) */
+#define FIX_0_601344887  ((JLONG)4926)          /* FIX(0.601344887) */
+#define FIX_0_720959822  ((JLONG)5906)          /* FIX(0.720959822) */
+#define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
+#define FIX_0_850430095  ((JLONG)6967)          /* FIX(0.850430095) */
+#define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
+#define FIX_1_061594337  ((JLONG)8697)          /* FIX(1.061594337) */
+#define FIX_1_272758580  ((JLONG)10426)         /* FIX(1.272758580) */
+#define FIX_1_451774981  ((JLONG)11893)         /* FIX(1.451774981) */
+#define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
+#define FIX_2_172734803  ((JLONG)17799)         /* FIX(2.172734803) */
+#define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
+#define FIX_3_624509785  ((JLONG)29692)         /* FIX(3.624509785) */
 #else
 #define FIX_0_211164243  FIX(0.211164243)
 #define FIX_0_509795579  FIX(0.509795579)
@@ -98,9 +98,9 @@
  */
 
 #if BITS_IN_JSAMPLE == 8
-#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
+#define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
 #else
-#define MULTIPLY(var,const)  ((var) * (const))
+#define MULTIPLY(var, const)  ((var) * (const))
 #endif
 
 
@@ -109,7 +109,7 @@
  * are 16 bits or less, so either int or short multiply will work.
  */
 
-#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))
+#define DEQUANTIZE(coef, quantval)  (((ISLOW_MULT_TYPE)(coef)) * (quantval))
 
 
 /*
@@ -118,9 +118,9 @@
  */
 
 GLOBAL(void)
-jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp2, tmp10, tmp12;
   JLONG z1, z2, z3, z4;
@@ -130,69 +130,73 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE*4];     /* buffers data between passes */
+  int workspace[DCTSIZE * 4];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
     /* Don't bother to process column 4, because second pass won't use it */
-    if (ctr == DCTSIZE-4)
+    if (ctr == DCTSIZE - 4)
       continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
-        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*5] == 0 &&
-        inptr[DCTSIZE*6] == 0 && inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
+        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 5] == 0 &&
+        inptr[DCTSIZE * 6] == 0 && inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero; we need not examine term 4 for 4x4 output */
-      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
-                             PASS1_BITS);
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+                                        quantptr[DCTSIZE * 0]), PASS1_BITS);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
-      wsptr[DCTSIZE*2] = dcval;
-      wsptr[DCTSIZE*3] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
+      wsptr[DCTSIZE * 2] = dcval;
+      wsptr[DCTSIZE * 3] = dcval;
 
       continue;
     }
 
     /* Even part */
 
-    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS+1);
+    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS + 1);
 
-    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
 
-    tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, - FIX_0_765366865);
+    tmp2 = MULTIPLY(z2, FIX_1_847759065) + MULTIPLY(z3, -FIX_0_765366865);
 
     tmp10 = tmp0 + tmp2;
     tmp12 = tmp0 - tmp2;
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    z2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    z4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+    z2 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    z4 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
 
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-         + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-         + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-         + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+    tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+           MULTIPLY(z2,  FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+           MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+           MULTIPLY(z4,  FIX_1_061594337);  /* sqrt(2) * ( c5+c7) */
 
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-         + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-         + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-         + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+    tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+           MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+           MULTIPLY(z3,  FIX_0_899976223) + /* sqrt(2) * (c3-c7) */
+           MULTIPLY(z4,  FIX_2_562915447);  /* sqrt(2) * (c1+c3) */
 
     /* Final output stage */
 
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*3] = (int) DESCALE(tmp10 - tmp2, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp12 + tmp0, CONST_BITS-PASS1_BITS+1);
-    wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 - tmp0, CONST_BITS-PASS1_BITS+1);
+    wsptr[DCTSIZE * 0] =
+      (int)DESCALE(tmp10 + tmp2, CONST_BITS - PASS1_BITS + 1);
+    wsptr[DCTSIZE * 3] =
+      (int)DESCALE(tmp10 - tmp2, CONST_BITS - PASS1_BITS + 1);
+    wsptr[DCTSIZE * 1] =
+      (int)DESCALE(tmp12 + tmp0, CONST_BITS - PASS1_BITS + 1);
+    wsptr[DCTSIZE * 2] =
+      (int)DESCALE(tmp12 - tmp0, CONST_BITS - PASS1_BITS + 1);
   }
 
   /* Pass 2: process 4 rows from work array, store into output array. */
@@ -206,8 +210,8 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
     if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 &&
         wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                               PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -221,45 +225,45 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Even part */
 
-    tmp0 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+1);
+    tmp0 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 1);
 
-    tmp2 = MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-         + MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865);
+    tmp2 = MULTIPLY((JLONG)wsptr[2],  FIX_1_847759065) +
+           MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865);
 
     tmp10 = tmp0 + tmp2;
     tmp12 = tmp0 - tmp2;
 
     /* Odd part */
 
-    z1 = (JLONG) wsptr[7];
-    z2 = (JLONG) wsptr[5];
-    z3 = (JLONG) wsptr[3];
-    z4 = (JLONG) wsptr[1];
+    z1 = (JLONG)wsptr[7];
+    z2 = (JLONG)wsptr[5];
+    z3 = (JLONG)wsptr[3];
+    z4 = (JLONG)wsptr[1];
 
-    tmp0 = MULTIPLY(z1, - FIX_0_211164243) /* sqrt(2) * (c3-c1) */
-         + MULTIPLY(z2, FIX_1_451774981) /* sqrt(2) * (c3+c7) */
-         + MULTIPLY(z3, - FIX_2_172734803) /* sqrt(2) * (-c1-c5) */
-         + MULTIPLY(z4, FIX_1_061594337); /* sqrt(2) * (c5+c7) */
+    tmp0 = MULTIPLY(z1, -FIX_0_211164243) + /* sqrt(2) * ( c3-c1) */
+           MULTIPLY(z2,  FIX_1_451774981) + /* sqrt(2) * ( c3+c7) */
+           MULTIPLY(z3, -FIX_2_172734803) + /* sqrt(2) * (-c1-c5) */
+           MULTIPLY(z4,  FIX_1_061594337);  /* sqrt(2) * ( c5+c7) */
 
-    tmp2 = MULTIPLY(z1, - FIX_0_509795579) /* sqrt(2) * (c7-c5) */
-         + MULTIPLY(z2, - FIX_0_601344887) /* sqrt(2) * (c5-c1) */
-         + MULTIPLY(z3, FIX_0_899976223) /* sqrt(2) * (c3-c7) */
-         + MULTIPLY(z4, FIX_2_562915447); /* sqrt(2) * (c1+c3) */
+    tmp2 = MULTIPLY(z1, -FIX_0_509795579) + /* sqrt(2) * (c7-c5) */
+           MULTIPLY(z2, -FIX_0_601344887) + /* sqrt(2) * (c5-c1) */
+           MULTIPLY(z3, FIX_0_899976223) +  /* sqrt(2) * (c3-c7) */
+           MULTIPLY(z4, FIX_2_562915447);   /* sqrt(2) * (c1+c3) */
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp2,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
-    outptr[3] = range_limit[(int) DESCALE(tmp10 - tmp2,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp12 + tmp0,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
-    outptr[2] = range_limit[(int) DESCALE(tmp12 - tmp0,
-                                          CONST_BITS+PASS1_BITS+3+1)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp2,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
+    outptr[3] = range_limit[(int)DESCALE(tmp10 - tmp2,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)DESCALE(tmp12 + tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
+    outptr[2] = range_limit[(int)DESCALE(tmp12 - tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 1) &
+                            RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
@@ -272,9 +276,9 @@ jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   JLONG tmp0, tmp10, z1;
   JCOEFPTR inptr;
@@ -283,50 +287,52 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   JSAMPROW outptr;
   JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   int ctr;
-  int workspace[DCTSIZE*2];     /* buffers data between passes */
+  int workspace[DCTSIZE * 2];   /* buffers data between passes */
   SHIFT_TEMPS
 
   /* Pass 1: process columns from input, store into work array. */
 
   inptr = coef_block;
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   wsptr = workspace;
   for (ctr = DCTSIZE; ctr > 0; inptr++, quantptr++, wsptr++, ctr--) {
     /* Don't bother to process columns 2,4,6 */
-    if (ctr == DCTSIZE-2 || ctr == DCTSIZE-4 || ctr == DCTSIZE-6)
+    if (ctr == DCTSIZE - 2 || ctr == DCTSIZE - 4 || ctr == DCTSIZE - 6)
       continue;
-    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*3] == 0 &&
-        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*7] == 0) {
+    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 3] == 0 &&
+        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 7] == 0) {
       /* AC terms all zero; we need not examine terms 2,4,6 for 2x2 output */
-      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]),
-                             PASS1_BITS);
+      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
+                             quantptr[DCTSIZE * 0]), PASS1_BITS);
 
-      wsptr[DCTSIZE*0] = dcval;
-      wsptr[DCTSIZE*1] = dcval;
+      wsptr[DCTSIZE * 0] = dcval;
+      wsptr[DCTSIZE * 1] = dcval;
 
       continue;
     }
 
     /* Even part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
-    tmp10 = LEFT_SHIFT(z1, CONST_BITS+2);
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
+    tmp10 = LEFT_SHIFT(z1, CONST_BITS + 2);
 
     /* Odd part */
 
-    z1 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
-    tmp0 = MULTIPLY(z1, - FIX_0_720959822); /* sqrt(2) * (c7-c5+c3-c1) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
-    tmp0 += MULTIPLY(z1, FIX_0_850430095); /* sqrt(2) * (-c1+c3+c5+c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
-    tmp0 += MULTIPLY(z1, - FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
-    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
-    tmp0 += MULTIPLY(z1, FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
+    tmp0 = MULTIPLY(z1, -FIX_0_720959822);  /* sqrt(2) * ( c7-c5+c3-c1) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
+    tmp0 += MULTIPLY(z1, FIX_0_850430095);  /* sqrt(2) * (-c1+c3+c5+c7) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
+    tmp0 += MULTIPLY(z1, -FIX_1_272758580); /* sqrt(2) * (-c1+c3-c5-c7) */
+    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
+    tmp0 += MULTIPLY(z1, FIX_3_624509785);  /* sqrt(2) * ( c1+c3+c5+c7) */
 
     /* Final output stage */
 
-    wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp0, CONST_BITS-PASS1_BITS+2);
-    wsptr[DCTSIZE*1] = (int) DESCALE(tmp10 - tmp0, CONST_BITS-PASS1_BITS+2);
+    wsptr[DCTSIZE * 0] =
+      (int)DESCALE(tmp10 + tmp0, CONST_BITS - PASS1_BITS + 2);
+    wsptr[DCTSIZE * 1] =
+      (int)DESCALE(tmp10 - tmp0, CONST_BITS - PASS1_BITS + 2);
   }
 
   /* Pass 2: process 2 rows from work array, store into output array. */
@@ -339,8 +345,8 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 #ifndef NO_ZERO_ROW_TEST
     if (wsptr[1] == 0 && wsptr[3] == 0 && wsptr[5] == 0 && wsptr[7] == 0) {
       /* AC terms all zero */
-      JSAMPLE dcval = range_limit[(int) DESCALE((JLONG) wsptr[0], PASS1_BITS+3)
-                                  & RANGE_MASK];
+      JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
+                                               PASS1_BITS + 3) & RANGE_MASK];
 
       outptr[0] = dcval;
       outptr[1] = dcval;
@@ -352,23 +358,23 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
     /* Even part */
 
-    tmp10 = LEFT_SHIFT((JLONG) wsptr[0], CONST_BITS+2);
+    tmp10 = LEFT_SHIFT((JLONG)wsptr[0], CONST_BITS + 2);
 
     /* Odd part */
 
-    tmp0 = MULTIPLY((JLONG) wsptr[7], - FIX_0_720959822) /* sqrt(2) * (c7-c5+c3-c1) */
-         + MULTIPLY((JLONG) wsptr[5], FIX_0_850430095) /* sqrt(2) * (-c1+c3+c5+c7) */
-         + MULTIPLY((JLONG) wsptr[3], - FIX_1_272758580) /* sqrt(2) * (-c1+c3-c5-c7) */
-         + MULTIPLY((JLONG) wsptr[1], FIX_3_624509785); /* sqrt(2) * (c1+c3+c5+c7) */
+    tmp0 = MULTIPLY((JLONG)wsptr[7], -FIX_0_720959822) + /* sqrt(2) * ( c7-c5+c3-c1) */
+           MULTIPLY((JLONG)wsptr[5],  FIX_0_850430095) + /* sqrt(2) * (-c1+c3+c5+c7) */
+           MULTIPLY((JLONG)wsptr[3], -FIX_1_272758580) + /* sqrt(2) * (-c1+c3-c5-c7) */
+           MULTIPLY((JLONG)wsptr[1],  FIX_3_624509785);  /* sqrt(2) * ( c1+c3+c5+c7) */
 
     /* Final output stage */
 
-    outptr[0] = range_limit[(int) DESCALE(tmp10 + tmp0,
-                                          CONST_BITS+PASS1_BITS+3+2)
-                            & RANGE_MASK];
-    outptr[1] = range_limit[(int) DESCALE(tmp10 - tmp0,
-                                          CONST_BITS+PASS1_BITS+3+2)
-                            & RANGE_MASK];
+    outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 2) &
+                            RANGE_MASK];
+    outptr[1] = range_limit[(int)DESCALE(tmp10 - tmp0,
+                                         CONST_BITS + PASS1_BITS + 3 + 2) &
+                            RANGE_MASK];
 
     wsptr += DCTSIZE;           /* advance pointer to next row */
   }
@@ -381,9 +387,9 @@ jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
  */
 
 GLOBAL(void)
-jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-               JCOEFPTR coef_block,
-               JSAMPARRAY output_buf, JDIMENSION output_col)
+jpeg_idct_1x1(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+              JDIMENSION output_col)
 {
   int dcval;
   ISLOW_MULT_TYPE *quantptr;
@@ -393,9 +399,9 @@ jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
   /* We hardly need an inverse DCT routine for this: just take the
    * average pixel value, which is one-eighth of the DC coefficient.
    */
-  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
+  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
-  dcval = (int) DESCALE((JLONG) dcval, 3);
+  dcval = (int)DESCALE((JLONG)dcval, 3);
 
   output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
 }
diff --git a/media/libjpeg/jinclude.h b/media/libjpeg/jinclude.h
index d461a1aa16..120614b25c 100644
--- a/media/libjpeg/jinclude.h
+++ b/media/libjpeg/jinclude.h
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1994, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -17,68 +17,117 @@
  * JPEG library.  Most applications need only include jpeglib.h.
  */
 
+#ifndef __JINCLUDE_H__
+#define __JINCLUDE_H__
 
 /* Include auto-config file to find out which system include files we need. */
 
 #include "jconfig.h"            /* auto configuration options */
+#include "jconfigint.h"
 #define JCONFIG_INCLUDED        /* so that jpeglib.h doesn't do it again */
 
 /*
- * We need the NULL macro and size_t typedef.
- * On an ANSI-conforming system it is sufficient to include <stddef.h>.
- * Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to
- * pull in <sys/types.h> as well.
  * Note that the core JPEG library does not require <stdio.h>;
  * only the default error handler and data source/destination modules do.
  * But we must pull it in because of the references to FILE in jpeglib.h.
  * You can remove those references if you want to compile without <stdio.h>.
  */
 
-#ifdef HAVE_STDDEF_H
 #include <stddef.h>
-#endif
-
-#ifdef HAVE_STDLIB_H
 #include <stdlib.h>
-#endif
-
-#ifdef NEED_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
 #include <stdio.h>
-
-/*
- * We need memory copying and zeroing functions, plus strncpy().
- * ANSI and System V implementations declare these in <string.h>.
- * BSD doesn't have the mem() functions, but it does have bcopy()/bzero().
- * Some systems may declare memset and memcpy in <memory.h>.
- *
- * NOTE: we assume the size parameters to these functions are of type size_t.
- * Change the casts in these macros if not!
- */
-
-#ifdef NEED_BSD_STRINGS
-
-#include <strings.h>
-#define MEMZERO(target,size)    bzero((void *)(target), (size_t)(size))
-#define MEMCOPY(dest,src,size)  bcopy((const void *)(src), (void *)(dest), (size_t)(size))
-
-#else /* not BSD, assume ANSI/SysV string lib */
-
 #include <string.h>
-#define MEMZERO(target,size)    memset((void *)(target), 0, (size_t)(size))
-#define MEMCOPY(dest,src,size)  memcpy((void *)(dest), (const void *)(src), (size_t)(size))
-
-#endif
 
 /*
- * The modules that use fread() and fwrite() always invoke them through
- * these macros.  On some systems you may need to twiddle the argument casts.
- * CAUTION: argument order is different from underlying functions!
+ * These macros/inline functions facilitate using Microsoft's "safe string"
+ * functions with Visual Studio builds without the need to scatter #ifdefs
+ * throughout the code base.
  */
 
-#define JFREAD(file,buf,sizeofbuf)  \
-  ((size_t) fread((void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
-#define JFWRITE(file,buf,sizeofbuf)  \
-  ((size_t) fwrite((const void *) (buf), (size_t) 1, (size_t) (sizeofbuf), (file)))
+
+#ifndef NO_GETENV
+
+#ifdef _MSC_VER
+
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+  size_t required_size;
+
+  return (int)getenv_s(&required_size, buffer, buffer_size, name);
+}
+
+#else /* _MSC_VER */
+
+#include <errno.h>
+
+/* This provides a similar interface to the Microsoft/C11 getenv_s() function,
+ * but other than parameter validation, it has no advantages over getenv().
+ */
+
+static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
+{
+  char *env;
+
+  if (!buffer) {
+    if (buffer_size == 0)
+      return 0;
+    else
+      return (errno = EINVAL);
+  }
+  if (buffer_size == 0)
+    return (errno = EINVAL);
+  if (!name) {
+    *buffer = 0;
+    return 0;
+  }
+
+  env = getenv(name);
+  if (!env)
+  {
+    *buffer = 0;
+    return 0;
+  }
+
+  if (strlen(env) + 1 > buffer_size) {
+    *buffer = 0;
+    return ERANGE;
+  }
+
+  strncpy(buffer, env, buffer_size);
+
+  return 0;
+}
+
+#endif /* _MSC_VER */
+
+#endif /* NO_GETENV */
+
+
+#ifndef NO_PUTENV
+
+#ifdef _WIN32
+
+#define PUTENV_S(name, value)  _putenv_s(name, value)
+
+#else
+
+/* This provides a similar interface to the Microsoft _putenv_s() function, but
+ * other than parameter validation, it has no advantages over setenv().
+ */
+
+static INLINE int PUTENV_S(const char *name, const char *value)
+{
+  if (!name || !value)
+    return (errno = EINVAL);
+
+  setenv(name, value, 1);
+
+  return errno;
+}
+
+#endif /* _WIN32 */
+
+#endif /* NO_PUTENV */
+
+
+#endif /* JINCLUDE_H */
diff --git a/media/libjpeg/jmemmgr.c b/media/libjpeg/jmemmgr.c
index 2a8d8401f4..8f5a4ab1c7 100644
--- a/media/libjpeg/jmemmgr.c
+++ b/media/libjpeg/jmemmgr.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2016, D. R. Commander.
+ * Copyright (C) 2016, 2021-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -32,18 +32,14 @@
 #include "jinclude.h"
 #include "jpeglib.h"
 #include "jmemsys.h"            /* import the system-dependent declarations */
+#if !defined(_MSC_VER) || _MSC_VER > 1600
 #include <stdint.h>
-#include <limits.h>             /* some NDKs define SIZE_MAX in limits.h */
-
-#ifndef NO_GETENV
-#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare getenv() */
-extern char *getenv (const char *name);
-#endif
 #endif
+#include <limits.h>
 
 
 LOCAL(size_t)
-round_up_pow2 (size_t a, size_t b)
+round_up_pow2(size_t a, size_t b)
 /* a rounded up to the next multiple of b, i.e. ceil(a/b)*b */
 /* Assumes a >= 0, b > 0, and b is a power of 2 */
 {
@@ -87,7 +83,9 @@ round_up_pow2 (size_t a, size_t b)
 #ifndef WITH_SIMD
 #define ALIGN_SIZE  sizeof(double)
 #else
-#define ALIGN_SIZE  16 /* Most SIMD implementations require this */
+#define ALIGN_SIZE  32 /* Most of the SIMD instructions we support require
+                          16-byte (128-bit) alignment, but AVX2 requires
+                          32-byte alignment. */
 #endif
 #endif
 
@@ -102,7 +100,7 @@ round_up_pow2 (size_t a, size_t b)
 typedef struct small_pool_struct *small_pool_ptr;
 
 typedef struct small_pool_struct {
-  small_pool_ptr next;  /* next in list of pools */
+  small_pool_ptr next;          /* next in list of pools */
   size_t bytes_used;            /* how many bytes already used within pool */
   size_t bytes_left;            /* bytes still available in this pool */
 } small_pool_hdr;
@@ -110,7 +108,7 @@ typedef struct small_pool_struct {
 typedef struct large_pool_struct *large_pool_ptr;
 
 typedef struct large_pool_struct {
-  large_pool_ptr next;  /* next in list of pools */
+  large_pool_ptr next;          /* next in list of pools */
   size_t bytes_used;            /* how many bytes already used within pool */
   size_t bytes_left;            /* bytes still available in this pool */
 } large_pool_hdr;
@@ -189,9 +187,9 @@ struct jvirt_barray_control {
 #ifdef MEM_STATS                /* optional extra stuff for statistics */
 
 LOCAL(void)
-print_mem_stats (j_common_ptr cinfo, int pool_id)
+print_mem_stats(j_common_ptr cinfo, int pool_id)
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   small_pool_ptr shdr_ptr;
   large_pool_ptr lhdr_ptr;
 
@@ -204,15 +202,13 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
 
   for (lhdr_ptr = mem->large_list[pool_id]; lhdr_ptr != NULL;
        lhdr_ptr = lhdr_ptr->next) {
-    fprintf(stderr, "  Large chunk used %ld\n",
-            (long) lhdr_ptr->bytes_used);
+    fprintf(stderr, "  Large chunk used %ld\n", (long)lhdr_ptr->bytes_used);
   }
 
   for (shdr_ptr = mem->small_list[pool_id]; shdr_ptr != NULL;
        shdr_ptr = shdr_ptr->next) {
     fprintf(stderr, "  Small chunk used %ld free %ld\n",
-            (long) shdr_ptr->bytes_used,
-            (long) shdr_ptr->bytes_left);
+            (long)shdr_ptr->bytes_used, (long)shdr_ptr->bytes_left);
   }
 }
 
@@ -220,7 +216,7 @@ print_mem_stats (j_common_ptr cinfo, int pool_id)
 
 
 LOCAL(void)
-out_of_memory (j_common_ptr cinfo, int which)
+out_of_memory(j_common_ptr cinfo, int which)
 /* Report an out-of-memory error and stop execution */
 /* If we compiled MEM_STATS support, report alloc requests before dying */
 {
@@ -248,26 +244,24 @@ out_of_memory (j_common_ptr cinfo, int which)
  * adjustment.
  */
 
-static const size_t first_pool_slop[JPOOL_NUMPOOLS] =
-{
-        1600,                   /* first PERMANENT pool */
-        16000                   /* first IMAGE pool */
+static const size_t first_pool_slop[JPOOL_NUMPOOLS] = {
+  1600,                         /* first PERMANENT pool */
+  16000                         /* first IMAGE pool */
 };
 
-static const size_t extra_pool_slop[JPOOL_NUMPOOLS] =
-{
-        0,                      /* additional PERMANENT pools */
-        5000                    /* additional IMAGE pools */
+static const size_t extra_pool_slop[JPOOL_NUMPOOLS] = {
+  0,                            /* additional PERMANENT pools */
+  5000                          /* additional IMAGE pools */
 };
 
 #define MIN_SLOP  50            /* greater than 0 to avoid futile looping */
 
 
 METHODDEF(void *)
-alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_small(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 /* Allocate a "small" object */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   small_pool_ptr hdr_ptr, prev_hdr_ptr;
   char *data_ptr;
   size_t min_request, slop;
@@ -311,11 +305,11 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
     else
       slop = extra_pool_slop[pool_id];
     /* Don't ask for more than MAX_ALLOC_CHUNK */
-    if (slop > (size_t) (MAX_ALLOC_CHUNK-min_request))
-      slop = (size_t) (MAX_ALLOC_CHUNK-min_request);
+    if (slop > (size_t)(MAX_ALLOC_CHUNK - min_request))
+      slop = (size_t)(MAX_ALLOC_CHUNK - min_request);
     /* Try to get space, if fail reduce slop and try again */
     for (;;) {
-      hdr_ptr = (small_pool_ptr) jpeg_get_small(cinfo, min_request + slop);
+      hdr_ptr = (small_pool_ptr)jpeg_get_small(cinfo, min_request + slop);
       if (hdr_ptr != NULL)
         break;
       slop /= 2;
@@ -334,7 +328,7 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
   }
 
   /* OK, allocate the object from the current pool */
-  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
   data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
   if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
     data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
@@ -342,7 +336,7 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
   hdr_ptr->bytes_used += sizeofobject;
   hdr_ptr->bytes_left -= sizeofobject;
 
-  return (void *) data_ptr;
+  return (void *)data_ptr;
 }
 
 
@@ -360,10 +354,10 @@ alloc_small (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  */
 
 METHODDEF(void *)
-alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
+alloc_large(j_common_ptr cinfo, int pool_id, size_t sizeofobject)
 /* Allocate a "large" object */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   large_pool_ptr hdr_ptr;
   char *data_ptr;
 
@@ -388,9 +382,9 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
   if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS)
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
-  hdr_ptr = (large_pool_ptr) jpeg_get_large(cinfo, sizeofobject +
-                                            sizeof(large_pool_hdr) +
-                                            ALIGN_SIZE - 1);
+  hdr_ptr = (large_pool_ptr)jpeg_get_large(cinfo, sizeofobject +
+                                           sizeof(large_pool_hdr) +
+                                           ALIGN_SIZE - 1);
   if (hdr_ptr == NULL)
     out_of_memory(cinfo, 4);    /* jpeg_get_large failed */
   mem->total_space_allocated += sizeofobject + sizeof(large_pool_hdr) +
@@ -405,12 +399,12 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
   hdr_ptr->bytes_left = 0;
   mem->large_list[pool_id] = hdr_ptr;
 
-  data_ptr = (char *) hdr_ptr; /* point to first data byte in pool... */
+  data_ptr = (char *)hdr_ptr; /* point to first data byte in pool... */
   data_ptr += sizeof(small_pool_hdr); /* ...by skipping the header... */
   if ((size_t)data_ptr % ALIGN_SIZE) /* ...and adjust for alignment */
     data_ptr += ALIGN_SIZE - (size_t)data_ptr % ALIGN_SIZE;
 
-  return (void *) data_ptr;
+  return (void *)data_ptr;
 }
 
 
@@ -431,11 +425,11 @@ alloc_large (j_common_ptr cinfo, int pool_id, size_t sizeofobject)
  */
 
 METHODDEF(JSAMPARRAY)
-alloc_sarray (j_common_ptr cinfo, int pool_id,
-              JDIMENSION samplesperrow, JDIMENSION numrows)
+alloc_sarray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+             JDIMENSION numrows)
 /* Allocate a 2-D sample array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   JSAMPARRAY result;
   JSAMPROW workspace;
   JDIMENSION rowsperchunk, currow, i;
@@ -454,27 +448,27 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
                                                            sizeof(JSAMPLE));
 
   /* Calculate max # of rows allowed in one allocation chunk */
-  ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
-          ((long) samplesperrow * sizeof(JSAMPLE));
+  ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+          ((long)samplesperrow * sizeof(JSAMPLE));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
-  if (ltemp < (long) numrows)
-    rowsperchunk = (JDIMENSION) ltemp;
+  if (ltemp < (long)numrows)
+    rowsperchunk = (JDIMENSION)ltemp;
   else
     rowsperchunk = numrows;
   mem->last_rowsperchunk = rowsperchunk;
 
   /* Get space for row pointers (small object) */
-  result = (JSAMPARRAY) alloc_small(cinfo, pool_id,
-                                    (size_t) (numrows * sizeof(JSAMPROW)));
+  result = (JSAMPARRAY)alloc_small(cinfo, pool_id,
+                                   (size_t)(numrows * sizeof(JSAMPROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
-    workspace = (JSAMPROW) alloc_large(cinfo, pool_id,
-        (size_t) ((size_t) rowsperchunk * (size_t) samplesperrow
-                  * sizeof(JSAMPLE)));
+    workspace = (JSAMPROW)alloc_large(cinfo, pool_id,
+      (size_t)((size_t)rowsperchunk * (size_t)samplesperrow *
+               sizeof(JSAMPLE)));
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += samplesperrow;
@@ -491,11 +485,11 @@ alloc_sarray (j_common_ptr cinfo, int pool_id,
  */
 
 METHODDEF(JBLOCKARRAY)
-alloc_barray (j_common_ptr cinfo, int pool_id,
-              JDIMENSION blocksperrow, JDIMENSION numrows)
+alloc_barray(j_common_ptr cinfo, int pool_id, JDIMENSION blocksperrow,
+             JDIMENSION numrows)
 /* Allocate a 2-D coefficient-block array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   JBLOCKARRAY result;
   JBLOCKROW workspace;
   JDIMENSION rowsperchunk, currow, i;
@@ -506,27 +500,27 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
     out_of_memory(cinfo, 6);    /* safety check */
 
   /* Calculate max # of rows allowed in one allocation chunk */
-  ltemp = (MAX_ALLOC_CHUNK-sizeof(large_pool_hdr)) /
-          ((long) blocksperrow * sizeof(JBLOCK));
+  ltemp = (MAX_ALLOC_CHUNK - sizeof(large_pool_hdr)) /
+          ((long)blocksperrow * sizeof(JBLOCK));
   if (ltemp <= 0)
     ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
-  if (ltemp < (long) numrows)
-    rowsperchunk = (JDIMENSION) ltemp;
+  if (ltemp < (long)numrows)
+    rowsperchunk = (JDIMENSION)ltemp;
   else
     rowsperchunk = numrows;
   mem->last_rowsperchunk = rowsperchunk;
 
   /* Get space for row pointers (small object) */
-  result = (JBLOCKARRAY) alloc_small(cinfo, pool_id,
-                                     (size_t) (numrows * sizeof(JBLOCKROW)));
+  result = (JBLOCKARRAY)alloc_small(cinfo, pool_id,
+                                    (size_t)(numrows * sizeof(JBLOCKROW)));
 
   /* Get the rows themselves (large objects) */
   currow = 0;
   while (currow < numrows) {
     rowsperchunk = MIN(rowsperchunk, numrows - currow);
-    workspace = (JBLOCKROW) alloc_large(cinfo, pool_id,
-        (size_t) ((size_t) rowsperchunk * (size_t) blocksperrow
-                  * sizeof(JBLOCK)));
+    workspace = (JBLOCKROW)alloc_large(cinfo, pool_id,
+        (size_t)((size_t)rowsperchunk * (size_t)blocksperrow *
+                  sizeof(JBLOCK)));
     for (i = rowsperchunk; i > 0; i--) {
       result[currow++] = workspace;
       workspace += blocksperrow;
@@ -575,12 +569,12 @@ alloc_barray (j_common_ptr cinfo, int pool_id,
 
 
 METHODDEF(jvirt_sarray_ptr)
-request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
-                     JDIMENSION samplesperrow, JDIMENSION numrows,
-                     JDIMENSION maxaccess)
+request_virt_sarray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                    JDIMENSION samplesperrow, JDIMENSION numrows,
+                    JDIMENSION maxaccess)
 /* Request a virtual 2-D sample array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   jvirt_sarray_ptr result;
 
   /* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -588,8 +582,8 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
   /* get control block */
-  result = (jvirt_sarray_ptr) alloc_small(cinfo, pool_id,
-                                          sizeof(struct jvirt_sarray_control));
+  result = (jvirt_sarray_ptr)alloc_small(cinfo, pool_id,
+                                         sizeof(struct jvirt_sarray_control));
 
   result->mem_buffer = NULL;    /* marks array not yet realized */
   result->rows_in_array = numrows;
@@ -605,12 +599,12 @@ request_virt_sarray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
 
 
 METHODDEF(jvirt_barray_ptr)
-request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
-                     JDIMENSION blocksperrow, JDIMENSION numrows,
-                     JDIMENSION maxaccess)
+request_virt_barray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                    JDIMENSION blocksperrow, JDIMENSION numrows,
+                    JDIMENSION maxaccess)
 /* Request a virtual 2-D coefficient-block array */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   jvirt_barray_ptr result;
 
   /* Only IMAGE-lifetime virtual arrays are currently supported */
@@ -618,8 +612,8 @@ request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
     ERREXIT1(cinfo, JERR_BAD_POOL_ID, pool_id); /* safety check */
 
   /* get control block */
-  result = (jvirt_barray_ptr) alloc_small(cinfo, pool_id,
-                                          sizeof(struct jvirt_barray_control));
+  result = (jvirt_barray_ptr)alloc_small(cinfo, pool_id,
+                                         sizeof(struct jvirt_barray_control));
 
   result->mem_buffer = NULL;    /* marks array not yet realized */
   result->rows_in_array = numrows;
@@ -635,10 +629,10 @@ request_virt_barray (j_common_ptr cinfo, int pool_id, boolean pre_zero,
 
 
 METHODDEF(void)
-realize_virt_arrays (j_common_ptr cinfo)
+realize_virt_arrays(j_common_ptr cinfo)
 /* Allocate the in-memory buffers for any unrealized virtual arrays */
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   size_t space_per_minheight, maximum_space, avail_mem;
   size_t minheights, max_minheights;
   jvirt_sarray_ptr sptr;
@@ -652,11 +646,11 @@ realize_virt_arrays (j_common_ptr cinfo)
   maximum_space = 0;
   for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
-      size_t new_space = (long) sptr->rows_in_array *
-                         (long) sptr->samplesperrow * sizeof(JSAMPLE);
+      size_t new_space = (long)sptr->rows_in_array *
+                         (long)sptr->samplesperrow * sizeof(JSAMPLE);
 
-      space_per_minheight += (long) sptr->maxaccess *
-                             (long) sptr->samplesperrow * sizeof(JSAMPLE);
+      space_per_minheight += (long)sptr->maxaccess *
+                             (long)sptr->samplesperrow * sizeof(JSAMPLE);
       if (SIZE_MAX - maximum_space < new_space)
         out_of_memory(cinfo, 10);
       maximum_space += new_space;
@@ -664,11 +658,11 @@ realize_virt_arrays (j_common_ptr cinfo)
   }
   for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
     if (bptr->mem_buffer == NULL) { /* if not realized yet */
-      size_t new_space = (long) bptr->rows_in_array *
-                         (long) bptr->blocksperrow * sizeof(JBLOCK);
+      size_t new_space = (long)bptr->rows_in_array *
+                         (long)bptr->blocksperrow * sizeof(JBLOCK);
 
-      space_per_minheight += (long) bptr->maxaccess *
-                             (long) bptr->blocksperrow * sizeof(JBLOCK);
+      space_per_minheight += (long)bptr->maxaccess *
+                             (long)bptr->blocksperrow * sizeof(JBLOCK);
       if (SIZE_MAX - maximum_space < new_space)
         out_of_memory(cinfo, 11);
       maximum_space += new_space;
@@ -701,17 +695,17 @@ realize_virt_arrays (j_common_ptr cinfo)
 
   for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
     if (sptr->mem_buffer == NULL) { /* if not realized yet */
-      minheights = ((long) sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
+      minheights = ((long)sptr->rows_in_array - 1L) / sptr->maxaccess + 1L;
       if (minheights <= max_minheights) {
         /* This buffer fits in memory */
         sptr->rows_in_mem = sptr->rows_in_array;
       } else {
         /* It doesn't fit in memory, create backing store. */
-        sptr->rows_in_mem = (JDIMENSION) (max_minheights * sptr->maxaccess);
-        jpeg_open_backing_store(cinfo, & sptr->b_s_info,
-                                (long) sptr->rows_in_array *
-                                (long) sptr->samplesperrow *
-                                (long) sizeof(JSAMPLE));
+        sptr->rows_in_mem = (JDIMENSION)(max_minheights * sptr->maxaccess);
+        jpeg_open_backing_store(cinfo, &sptr->b_s_info,
+                                (long)sptr->rows_in_array *
+                                (long)sptr->samplesperrow *
+                                (long)sizeof(JSAMPLE));
         sptr->b_s_open = TRUE;
       }
       sptr->mem_buffer = alloc_sarray(cinfo, JPOOL_IMAGE,
@@ -725,17 +719,17 @@ realize_virt_arrays (j_common_ptr cinfo)
 
   for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
     if (bptr->mem_buffer == NULL) { /* if not realized yet */
-      minheights = ((long) bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
+      minheights = ((long)bptr->rows_in_array - 1L) / bptr->maxaccess + 1L;
       if (minheights <= max_minheights) {
         /* This buffer fits in memory */
         bptr->rows_in_mem = bptr->rows_in_array;
       } else {
         /* It doesn't fit in memory, create backing store. */
-        bptr->rows_in_mem = (JDIMENSION) (max_minheights * bptr->maxaccess);
-        jpeg_open_backing_store(cinfo, & bptr->b_s_info,
-                                (long) bptr->rows_in_array *
-                                (long) bptr->blocksperrow *
-                                (long) sizeof(JBLOCK));
+        bptr->rows_in_mem = (JDIMENSION)(max_minheights * bptr->maxaccess);
+        jpeg_open_backing_store(cinfo, &bptr->b_s_info,
+                                (long)bptr->rows_in_array *
+                                (long)bptr->blocksperrow *
+                                (long)sizeof(JBLOCK));
         bptr->b_s_open = TRUE;
       }
       bptr->mem_buffer = alloc_barray(cinfo, JPOOL_IMAGE,
@@ -750,32 +744,32 @@ realize_virt_arrays (j_common_ptr cinfo)
 
 
 LOCAL(void)
-do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
+do_sarray_io(j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
 /* Do backing store read or write of a virtual sample array */
 {
   long bytesperrow, file_offset, byte_count, rows, thisrow, i;
 
-  bytesperrow = (long) ptr->samplesperrow * sizeof(JSAMPLE);
+  bytesperrow = (long)ptr->samplesperrow * sizeof(JSAMPLE);
   file_offset = ptr->cur_start_row * bytesperrow;
   /* Loop to read or write each allocation chunk in mem_buffer */
-  for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+  for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
     /* One chunk, but check for short chunk at end of buffer */
-    rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+    rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
     /* Transfer no more than is currently defined */
-    thisrow = (long) ptr->cur_start_row + i;
-    rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+    thisrow = (long)ptr->cur_start_row + i;
+    rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
     /* Transfer no more than fits in file */
-    rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+    rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
     if (rows <= 0)              /* this chunk might be past end of file! */
       break;
     byte_count = rows * bytesperrow;
     if (writing)
-      (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
-                                            (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+                                            (void *)ptr->mem_buffer[i],
                                             file_offset, byte_count);
     else
-      (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
-                                           (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+                                           (void *)ptr->mem_buffer[i],
                                            file_offset, byte_count);
     file_offset += byte_count;
   }
@@ -783,32 +777,32 @@ do_sarray_io (j_common_ptr cinfo, jvirt_sarray_ptr ptr, boolean writing)
 
 
 LOCAL(void)
-do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
+do_barray_io(j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
 /* Do backing store read or write of a virtual coefficient-block array */
 {
   long bytesperrow, file_offset, byte_count, rows, thisrow, i;
 
-  bytesperrow = (long) ptr->blocksperrow * sizeof(JBLOCK);
+  bytesperrow = (long)ptr->blocksperrow * sizeof(JBLOCK);
   file_offset = ptr->cur_start_row * bytesperrow;
   /* Loop to read or write each allocation chunk in mem_buffer */
-  for (i = 0; i < (long) ptr->rows_in_mem; i += ptr->rowsperchunk) {
+  for (i = 0; i < (long)ptr->rows_in_mem; i += ptr->rowsperchunk) {
     /* One chunk, but check for short chunk at end of buffer */
-    rows = MIN((long) ptr->rowsperchunk, (long) ptr->rows_in_mem - i);
+    rows = MIN((long)ptr->rowsperchunk, (long)ptr->rows_in_mem - i);
     /* Transfer no more than is currently defined */
-    thisrow = (long) ptr->cur_start_row + i;
-    rows = MIN(rows, (long) ptr->first_undef_row - thisrow);
+    thisrow = (long)ptr->cur_start_row + i;
+    rows = MIN(rows, (long)ptr->first_undef_row - thisrow);
     /* Transfer no more than fits in file */
-    rows = MIN(rows, (long) ptr->rows_in_array - thisrow);
+    rows = MIN(rows, (long)ptr->rows_in_array - thisrow);
     if (rows <= 0)              /* this chunk might be past end of file! */
       break;
     byte_count = rows * bytesperrow;
     if (writing)
-      (*ptr->b_s_info.write_backing_store) (cinfo, & ptr->b_s_info,
-                                            (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.write_backing_store) (cinfo, &ptr->b_s_info,
+                                            (void *)ptr->mem_buffer[i],
                                             file_offset, byte_count);
     else
-      (*ptr->b_s_info.read_backing_store) (cinfo, & ptr->b_s_info,
-                                           (void *) ptr->mem_buffer[i],
+      (*ptr->b_s_info.read_backing_store) (cinfo, &ptr->b_s_info,
+                                           (void *)ptr->mem_buffer[i],
                                            file_offset, byte_count);
     file_offset += byte_count;
   }
@@ -816,9 +810,8 @@ do_barray_io (j_common_ptr cinfo, jvirt_barray_ptr ptr, boolean writing)
 
 
 METHODDEF(JSAMPARRAY)
-access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
-                    JDIMENSION start_row, JDIMENSION num_rows,
-                    boolean writable)
+access_virt_sarray(j_common_ptr cinfo, jvirt_sarray_ptr ptr,
+                   JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
 /* Access the part of a virtual sample array starting at start_row */
 /* and extending for num_rows rows.  writable is true if  */
 /* caller intends to modify the accessed area. */
@@ -833,8 +826,8 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
 
   /* Make the desired part of the virtual array accessible */
   if (start_row < ptr->cur_start_row ||
-      end_row > ptr->cur_start_row+ptr->rows_in_mem) {
-    if (! ptr->b_s_open)
+      end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+    if (!ptr->b_s_open)
       ERREXIT(cinfo, JERR_VIRTUAL_BUG);
     /* Flush old buffer contents if necessary */
     if (ptr->dirty) {
@@ -854,10 +847,10 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
       /* use long arithmetic here to avoid overflow & unsigned problems */
       long ltemp;
 
-      ltemp = (long) end_row - (long) ptr->rows_in_mem;
+      ltemp = (long)end_row - (long)ptr->rows_in_mem;
       if (ltemp < 0)
         ltemp = 0;              /* don't fall off front end of file */
-      ptr->cur_start_row = (JDIMENSION) ltemp;
+      ptr->cur_start_row = (JDIMENSION)ltemp;
     }
     /* Read in the selected part of the array.
      * During the initial write pass, we will do no actual read
@@ -880,15 +873,15 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
     if (writable)
       ptr->first_undef_row = end_row;
     if (ptr->pre_zero) {
-      size_t bytesperrow = (size_t) ptr->samplesperrow * sizeof(JSAMPLE);
+      size_t bytesperrow = (size_t)ptr->samplesperrow * sizeof(JSAMPLE);
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-        jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+        jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
         undef_row++;
       }
     } else {
-      if (! writable)           /* reader looking at undefined data */
+      if (!writable)            /* reader looking at undefined data */
         ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
     }
   }
@@ -901,9 +894,8 @@ access_virt_sarray (j_common_ptr cinfo, jvirt_sarray_ptr ptr,
 
 
 METHODDEF(JBLOCKARRAY)
-access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
-                    JDIMENSION start_row, JDIMENSION num_rows,
-                    boolean writable)
+access_virt_barray(j_common_ptr cinfo, jvirt_barray_ptr ptr,
+                   JDIMENSION start_row, JDIMENSION num_rows, boolean writable)
 /* Access the part of a virtual block array starting at start_row */
 /* and extending for num_rows rows.  writable is true if  */
 /* caller intends to modify the accessed area. */
@@ -918,8 +910,8 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
 
   /* Make the desired part of the virtual array accessible */
   if (start_row < ptr->cur_start_row ||
-      end_row > ptr->cur_start_row+ptr->rows_in_mem) {
-    if (! ptr->b_s_open)
+      end_row > ptr->cur_start_row + ptr->rows_in_mem) {
+    if (!ptr->b_s_open)
       ERREXIT(cinfo, JERR_VIRTUAL_BUG);
     /* Flush old buffer contents if necessary */
     if (ptr->dirty) {
@@ -939,10 +931,10 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
       /* use long arithmetic here to avoid overflow & unsigned problems */
       long ltemp;
 
-      ltemp = (long) end_row - (long) ptr->rows_in_mem;
+      ltemp = (long)end_row - (long)ptr->rows_in_mem;
       if (ltemp < 0)
         ltemp = 0;              /* don't fall off front end of file */
-      ptr->cur_start_row = (JDIMENSION) ltemp;
+      ptr->cur_start_row = (JDIMENSION)ltemp;
     }
     /* Read in the selected part of the array.
      * During the initial write pass, we will do no actual read
@@ -965,15 +957,15 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
     if (writable)
       ptr->first_undef_row = end_row;
     if (ptr->pre_zero) {
-      size_t bytesperrow = (size_t) ptr->blocksperrow * sizeof(JBLOCK);
+      size_t bytesperrow = (size_t)ptr->blocksperrow * sizeof(JBLOCK);
       undef_row -= ptr->cur_start_row; /* make indexes relative to buffer */
       end_row -= ptr->cur_start_row;
       while (undef_row < end_row) {
-        jzero_far((void *) ptr->mem_buffer[undef_row], bytesperrow);
+        jzero_far((void *)ptr->mem_buffer[undef_row], bytesperrow);
         undef_row++;
       }
     } else {
-      if (! writable)           /* reader looking at undefined data */
+      if (!writable)            /* reader looking at undefined data */
         ERREXIT(cinfo, JERR_BAD_VIRTUAL_ACCESS);
     }
   }
@@ -990,9 +982,9 @@ access_virt_barray (j_common_ptr cinfo, jvirt_barray_ptr ptr,
  */
 
 METHODDEF(void)
-free_pool (j_common_ptr cinfo, int pool_id)
+free_pool(j_common_ptr cinfo, int pool_id)
 {
-  my_mem_ptr mem = (my_mem_ptr) cinfo->mem;
+  my_mem_ptr mem = (my_mem_ptr)cinfo->mem;
   small_pool_ptr shdr_ptr;
   large_pool_ptr lhdr_ptr;
   size_t space_freed;
@@ -1013,14 +1005,14 @@ free_pool (j_common_ptr cinfo, int pool_id)
     for (sptr = mem->virt_sarray_list; sptr != NULL; sptr = sptr->next) {
       if (sptr->b_s_open) {     /* there may be no backing store */
         sptr->b_s_open = FALSE; /* prevent recursive close if error */
-        (*sptr->b_s_info.close_backing_store) (cinfo, & sptr->b_s_info);
+        (*sptr->b_s_info.close_backing_store) (cinfo, &sptr->b_s_info);
       }
     }
     mem->virt_sarray_list = NULL;
     for (bptr = mem->virt_barray_list; bptr != NULL; bptr = bptr->next) {
       if (bptr->b_s_open) {     /* there may be no backing store */
         bptr->b_s_open = FALSE; /* prevent recursive close if error */
-        (*bptr->b_s_info.close_backing_store) (cinfo, & bptr->b_s_info);
+        (*bptr->b_s_info.close_backing_store) (cinfo, &bptr->b_s_info);
       }
     }
     mem->virt_barray_list = NULL;
@@ -1034,8 +1026,8 @@ free_pool (j_common_ptr cinfo, int pool_id)
     large_pool_ptr next_lhdr_ptr = lhdr_ptr->next;
     space_freed = lhdr_ptr->bytes_used +
                   lhdr_ptr->bytes_left +
-                  sizeof(large_pool_hdr);
-    jpeg_free_large(cinfo, (void *) lhdr_ptr, space_freed);
+                  sizeof(large_pool_hdr) + ALIGN_SIZE - 1;
+    jpeg_free_large(cinfo, (void *)lhdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
     lhdr_ptr = next_lhdr_ptr;
   }
@@ -1046,10 +1038,9 @@ free_pool (j_common_ptr cinfo, int pool_id)
 
   while (shdr_ptr != NULL) {
     small_pool_ptr next_shdr_ptr = shdr_ptr->next;
-    space_freed = shdr_ptr->bytes_used +
-                  shdr_ptr->bytes_left +
-                  sizeof(small_pool_hdr);
-    jpeg_free_small(cinfo, (void *) shdr_ptr, space_freed);
+    space_freed = shdr_ptr->bytes_used + shdr_ptr->bytes_left +
+                  sizeof(small_pool_hdr) + ALIGN_SIZE - 1;
+    jpeg_free_small(cinfo, (void *)shdr_ptr, space_freed);
     mem->total_space_allocated -= space_freed;
     shdr_ptr = next_shdr_ptr;
   }
@@ -1062,7 +1053,7 @@ free_pool (j_common_ptr cinfo, int pool_id)
  */
 
 METHODDEF(void)
-self_destruct (j_common_ptr cinfo)
+self_destruct(j_common_ptr cinfo)
 {
   int pool;
 
@@ -1070,12 +1061,12 @@ self_destruct (j_common_ptr cinfo)
    * Releasing pools in reverse order might help avoid fragmentation
    * with some (brain-damaged) malloc libraries.
    */
-  for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+  for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
     free_pool(cinfo, pool);
   }
 
   /* Release the memory manager control block too. */
-  jpeg_free_small(cinfo, (void *) cinfo->mem, sizeof(my_memory_mgr));
+  jpeg_free_small(cinfo, (void *)cinfo->mem, sizeof(my_memory_mgr));
   cinfo->mem = NULL;            /* ensures I will be called only once */
 
   jpeg_mem_term(cinfo);         /* system-dependent cleanup */
@@ -1088,7 +1079,7 @@ self_destruct (j_common_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_memory_mgr (j_common_ptr cinfo)
+jinit_memory_mgr(j_common_ptr cinfo)
 {
   my_mem_ptr mem;
   long max_to_use;
@@ -1104,22 +1095,22 @@ jinit_memory_mgr (j_common_ptr cinfo)
    * in common if and only if X is a power of 2, ie has only one one-bit.
    * Some compilers may give an "unreachable code" warning here; ignore it.
    */
-  if ((ALIGN_SIZE & (ALIGN_SIZE-1)) != 0)
+  if ((ALIGN_SIZE & (ALIGN_SIZE - 1)) != 0)
     ERREXIT(cinfo, JERR_BAD_ALIGN_TYPE);
   /* MAX_ALLOC_CHUNK must be representable as type size_t, and must be
    * a multiple of ALIGN_SIZE.
    * Again, an "unreachable code" warning may be ignored here.
    * But a "constant too large" warning means you need to fix MAX_ALLOC_CHUNK.
    */
-  test_mac = (size_t) MAX_ALLOC_CHUNK;
-  if ((long) test_mac != MAX_ALLOC_CHUNK ||
+  test_mac = (size_t)MAX_ALLOC_CHUNK;
+  if ((long)test_mac != MAX_ALLOC_CHUNK ||
       (MAX_ALLOC_CHUNK % ALIGN_SIZE) != 0)
     ERREXIT(cinfo, JERR_BAD_ALLOC_CHUNK);
 
   max_to_use = jpeg_mem_init(cinfo); /* system-dependent initialization */
 
   /* Attempt to allocate memory manager's control block */
-  mem = (my_mem_ptr) jpeg_get_small(cinfo, sizeof(my_memory_mgr));
+  mem = (my_mem_ptr)jpeg_get_small(cinfo, sizeof(my_memory_mgr));
 
   if (mem == NULL) {
     jpeg_mem_term(cinfo);       /* system-dependent cleanup */
@@ -1145,7 +1136,7 @@ jinit_memory_mgr (j_common_ptr cinfo)
   /* Initialize working state */
   mem->pub.max_memory_to_use = max_to_use;
 
-  for (pool = JPOOL_NUMPOOLS-1; pool >= JPOOL_PERMANENT; pool--) {
+  for (pool = JPOOL_NUMPOOLS - 1; pool >= JPOOL_PERMANENT; pool--) {
     mem->small_list[pool] = NULL;
     mem->large_list[pool] = NULL;
   }
@@ -1155,7 +1146,7 @@ jinit_memory_mgr (j_common_ptr cinfo)
   mem->total_space_allocated = sizeof(my_memory_mgr);
 
   /* Declare ourselves open for business */
-  cinfo->mem = & mem->pub;
+  cinfo->mem = &mem->pub;
 
   /* Check for an environment variable JPEGMEM; if found, override the
    * default max_memory setting from jpeg_mem_init.  Note that the
@@ -1164,12 +1155,17 @@ jinit_memory_mgr (j_common_ptr cinfo)
    * this feature.
    */
 #ifndef NO_GETENV
-  { char *memenv;
+  {
+    char memenv[30] = { 0 };
 
-    if ((memenv = getenv("JPEGMEM")) != NULL) {
+    if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) {
       char ch = 'x';
 
+#ifdef _MSC_VER
+      if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) {
+#else
       if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
+#endif
         if (ch == 'm' || ch == 'M')
           max_to_use *= 1000L;
         mem->pub.max_memory_to_use = max_to_use * 1000L;
diff --git a/media/libjpeg/jmemnobs.c b/media/libjpeg/jmemnobs.c
index 5797198de8..cd6571ba1c 100644
--- a/media/libjpeg/jmemnobs.c
+++ b/media/libjpeg/jmemnobs.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1992-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code and
- * information relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2017-2018, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -15,7 +15,6 @@
  * This is very portable in the sense that it'll compile on almost anything,
  * but you'd better have lots of main memory (or virtual memory) if you want
  * to process big images.
- * Note that the max_memory_to_use option is ignored by this implementation.
  */
 
 #define JPEG_INTERNALS
@@ -23,11 +22,6 @@
 #include "jpeglib.h"
 #include "jmemsys.h"            /* import the system-dependent declarations */
 
-#ifndef HAVE_STDLIB_H           /* <stdlib.h> should declare malloc(),free() */
-extern void *malloc (size_t size);
-extern void free (void *ptr);
-#endif
-
 
 /*
  * Memory allocation and freeing are controlled by the regular library
@@ -35,13 +29,13 @@ extern void free (void *ptr);
  */
 
 GLOBAL(void *)
-jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject)
 {
-  return (void *) malloc(sizeofobject);
+  return (void *)malloc(sizeofobject);
 }
 
 GLOBAL(void)
-jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_small(j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
@@ -52,13 +46,13 @@ jpeg_free_small (j_common_ptr cinfo, void *object, size_t sizeofobject)
  */
 
 GLOBAL(void *)
-jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject)
+jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject)
 {
-  return (void *) malloc(sizeofobject);
+  return (void *)malloc(sizeofobject);
 }
 
 GLOBAL(void)
-jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
+jpeg_free_large(j_common_ptr cinfo, void *object, size_t sizeofobject)
 {
   free(object);
 }
@@ -66,14 +60,21 @@ jpeg_free_large (j_common_ptr cinfo, void *object, size_t sizeofobject)
 
 /*
  * This routine computes the total memory space available for allocation.
- * Here we always say, "we got all you want bud!"
  */
 
 GLOBAL(size_t)
-jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
-                    size_t max_bytes_needed, size_t already_allocated)
+jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+                   size_t max_bytes_needed, size_t already_allocated)
 {
-  return max_bytes_needed;
+  if (cinfo->mem->max_memory_to_use) {
+    if ((size_t)cinfo->mem->max_memory_to_use > already_allocated)
+      return cinfo->mem->max_memory_to_use - already_allocated;
+    else
+      return 0;
+  } else {
+    /* Here we always say, "we got all you want bud!" */
+    return max_bytes_needed;
+  }
 }
 
 
@@ -84,8 +85,8 @@ jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
  */
 
 GLOBAL(void)
-jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
-                         long total_bytes_needed)
+jpeg_open_backing_store(j_common_ptr cinfo, backing_store_ptr info,
+                        long total_bytes_needed)
 {
   ERREXIT(cinfo, JERR_NO_BACKING_STORE);
 }
@@ -97,13 +98,13 @@ jpeg_open_backing_store (j_common_ptr cinfo, backing_store_ptr info,
  */
 
 GLOBAL(long)
-jpeg_mem_init (j_common_ptr cinfo)
+jpeg_mem_init(j_common_ptr cinfo)
 {
   return 0;                     /* just set max_memory_to_use to 0 */
 }
 
 GLOBAL(void)
-jpeg_mem_term (j_common_ptr cinfo)
+jpeg_mem_term(j_common_ptr cinfo)
 {
   /* no work */
 }
diff --git a/media/libjpeg/jmemsys.h b/media/libjpeg/jmemsys.h
index f7dfe87a83..9229550afd 100644
--- a/media/libjpeg/jmemsys.h
+++ b/media/libjpeg/jmemsys.h
@@ -31,9 +31,9 @@
  * size of the object being freed, just in case it's needed.
  */
 
-EXTERN(void *) jpeg_get_small (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
-                              size_t sizeofobject);
+EXTERN(void *) jpeg_get_small(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_small(j_common_ptr cinfo, void *object,
+                             size_t sizeofobject);
 
 /*
  * These two functions are used to allocate and release large chunks of
@@ -43,9 +43,9 @@ EXTERN(void) jpeg_free_small (j_common_ptr cinfo, void *object,
  * large chunks.
  */
 
-EXTERN(void *) jpeg_get_large (j_common_ptr cinfo, size_t sizeofobject);
-EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
-                              size_t sizeofobject);
+EXTERN(void *) jpeg_get_large(j_common_ptr cinfo, size_t sizeofobject);
+EXTERN(void) jpeg_free_large(j_common_ptr cinfo, void *object,
+                             size_t sizeofobject);
 
 /*
  * The macro MAX_ALLOC_CHUNK designates the maximum number of bytes that may
@@ -84,9 +84,9 @@ EXTERN(void) jpeg_free_large (j_common_ptr cinfo, void *object,
  * Conversely, zero may be returned to always use the minimum amount of memory.
  */
 
-EXTERN(size_t) jpeg_mem_available (j_common_ptr cinfo, size_t min_bytes_needed,
-                                   size_t max_bytes_needed,
-                                   size_t already_allocated);
+EXTERN(size_t) jpeg_mem_available(j_common_ptr cinfo, size_t min_bytes_needed,
+                                  size_t max_bytes_needed,
+                                  size_t already_allocated);
 
 
 /*
@@ -157,9 +157,9 @@ typedef struct backing_store_struct {
  * just take an error exit.)
  */
 
-EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
-                                      backing_store_ptr info,
-                                      long total_bytes_needed);
+EXTERN(void) jpeg_open_backing_store(j_common_ptr cinfo,
+                                     backing_store_ptr info,
+                                     long total_bytes_needed);
 
 
 /*
@@ -174,5 +174,5 @@ EXTERN(void) jpeg_open_backing_store (j_common_ptr cinfo,
  * all opened backing-store objects have been closed.
  */
 
-EXTERN(long) jpeg_mem_init (j_common_ptr cinfo);
-EXTERN(void) jpeg_mem_term (j_common_ptr cinfo);
+EXTERN(long) jpeg_mem_init(j_common_ptr cinfo);
+EXTERN(void) jpeg_mem_term(j_common_ptr cinfo);
diff --git a/media/libjpeg/jmorecfg.h b/media/libjpeg/jmorecfg.h
index d73b1090d5..8cda8041b2 100644
--- a/media/libjpeg/jmorecfg.h
+++ b/media/libjpeg/jmorecfg.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2011, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2011, 2014-2015, 2018, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -18,9 +18,9 @@
 
 /*
  * Maximum number of components (color channels) allowed in JPEG image.
- * To meet the letter of the JPEG spec, set this to 255.  However, darn
- * few applications need more than 4 channels (maybe 5 for CMYK + alpha
- * mask).  We recommend 10 as a reasonable compromise; use 4 if you are
+ * To meet the letter of Rec. ITU-T T.81 | ISO/IEC 10918-1, set this to 255.
+ * However, darn few applications need more than 4 channels (maybe 5 for CMYK +
+ * alpha mask).  We recommend 10 as a reasonable compromise; use 4 if you are
  * really short on memory.  (Each allowed component costs a hundred or so
  * bytes of storage, whether actually used in an image or not.)
  */
@@ -44,24 +44,10 @@
 
 #if BITS_IN_JSAMPLE == 8
 /* JSAMPLE should be the smallest type that will hold the values 0..255.
- * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JSAMPLE;
-#define GETJSAMPLE(value)  ((int) (value))
-
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JSAMPLE;
-#ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value)  ((int) (value))
-#else
-#define GETJSAMPLE(value)  ((int) (value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
+#define GETJSAMPLE(value)  ((int)(value))
 
 #define MAXJSAMPLE      255
 #define CENTERJSAMPLE   128
@@ -75,7 +61,7 @@ typedef char JSAMPLE;
  */
 
 typedef short JSAMPLE;
-#define GETJSAMPLE(value)  ((int) (value))
+#define GETJSAMPLE(value)  ((int)(value))
 
 #define MAXJSAMPLE      4095
 #define CENTERJSAMPLE   2048
@@ -98,22 +84,9 @@ typedef short JCOEF;
  * managers, this is also the data type passed to fread/fwrite.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JOCTET;
 #define GETJOCTET(value)  (value)
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JOCTET;
-#ifdef __CHAR_UNSIGNED__
-#define GETJOCTET(value)  (value)
-#else
-#define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 
 /* These typedefs are used for various table entries and so forth.
  * They must be at least as wide as specified; but making them too big
@@ -199,7 +172,7 @@ typedef unsigned int JDIMENSION;
  * software out there that uses it.
  */
 
-#define JMETHOD(type,methodname,arglist)  type (*methodname) arglist
+#define JMETHOD(type, methodname, arglist)  type (*methodname) arglist
 
 
 /* libjpeg-turbo no longer supports platforms that have far symbols (MS-DOS),
@@ -252,9 +225,9 @@ typedef int boolean;
 
 /* Capability options common to encoder and decoder: */
 
-#define DCT_ISLOW_SUPPORTED     /* slow but accurate integer algorithm */
-#define DCT_IFAST_SUPPORTED     /* faster, less accurate integer method */
-#define DCT_FLOAT_SUPPORTED     /* floating-point: accurate, fast on fast HW */
+#define DCT_ISLOW_SUPPORTED     /* accurate integer method */
+#define DCT_IFAST_SUPPORTED     /* less accurate int method [legacy feature] */
+#define DCT_FLOAT_SUPPORTED     /* floating-point method [legacy feature] */
 
 /* Encoder capability options: */
 
@@ -294,10 +267,10 @@ typedef int boolean;
  * with it.  In reality, few people ever did this, because there were some
  * severe restrictions involved (cjpeg and djpeg no longer worked properly,
  * compressing/decompressing RGB JPEGs no longer worked properly, and the color
- * quantizer wouldn't work with pixel sizes other than 3.)  Further, since all
- * of the O/S-supplied versions of libjpeg were built with the default values
- * of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications have
- * come to regard these values as immutable.
+ * quantizer wouldn't work with pixel sizes other than 3.)  Furthermore, since
+ * all of the O/S-supplied versions of libjpeg were built with the default
+ * values of RGB_RED, RGB_GREEN, RGB_BLUE, and RGB_PIXELSIZE, many applications
+ * have come to regard these values as immutable.
  *
  * The libjpeg-turbo colorspace extensions provide a much cleaner way of
  * compressing from/decompressing to buffers with arbitrary component orders
@@ -312,37 +285,37 @@ typedef int boolean;
 #define RGB_BLUE        2       /* Offset of Blue */
 #define RGB_PIXELSIZE   3       /* JSAMPLEs per RGB scanline element */
 
-#define JPEG_NUMCS 17
+#define JPEG_NUMCS  17
 
-#define EXT_RGB_RED        0
-#define EXT_RGB_GREEN      1
-#define EXT_RGB_BLUE       2
-#define EXT_RGB_PIXELSIZE  3
+#define EXT_RGB_RED         0
+#define EXT_RGB_GREEN       1
+#define EXT_RGB_BLUE        2
+#define EXT_RGB_PIXELSIZE   3
 
-#define EXT_RGBX_RED       0
-#define EXT_RGBX_GREEN     1
-#define EXT_RGBX_BLUE      2
-#define EXT_RGBX_PIXELSIZE 4
+#define EXT_RGBX_RED        0
+#define EXT_RGBX_GREEN      1
+#define EXT_RGBX_BLUE       2
+#define EXT_RGBX_PIXELSIZE  4
 
-#define EXT_BGR_RED        2
-#define EXT_BGR_GREEN      1
-#define EXT_BGR_BLUE       0
-#define EXT_BGR_PIXELSIZE  3
+#define EXT_BGR_RED         2
+#define EXT_BGR_GREEN       1
+#define EXT_BGR_BLUE        0
+#define EXT_BGR_PIXELSIZE   3
 
-#define EXT_BGRX_RED       2
-#define EXT_BGRX_GREEN     1
-#define EXT_BGRX_BLUE      0
-#define EXT_BGRX_PIXELSIZE 4
+#define EXT_BGRX_RED        2
+#define EXT_BGRX_GREEN      1
+#define EXT_BGRX_BLUE       0
+#define EXT_BGRX_PIXELSIZE  4
 
-#define EXT_XBGR_RED       3
-#define EXT_XBGR_GREEN     2
-#define EXT_XBGR_BLUE      1
-#define EXT_XBGR_PIXELSIZE 4
+#define EXT_XBGR_RED        3
+#define EXT_XBGR_GREEN      2
+#define EXT_XBGR_BLUE       1
+#define EXT_XBGR_PIXELSIZE  4
 
-#define EXT_XRGB_RED       1
-#define EXT_XRGB_GREEN     2
-#define EXT_XRGB_BLUE      3
-#define EXT_XRGB_PIXELSIZE 4
+#define EXT_XRGB_RED        1
+#define EXT_XRGB_GREEN      2
+#define EXT_XRGB_BLUE       3
+#define EXT_XRGB_PIXELSIZE  4
 
 static const int rgb_red[JPEG_NUMCS] = {
   -1, -1, RGB_RED, -1, -1, -1, EXT_RGB_RED, EXT_RGBX_RED,
@@ -383,7 +356,7 @@ static const int rgb_pixelsize[JPEG_NUMCS] = {
 #ifndef WITH_SIMD
 #define MULTIPLIER  int         /* type for fastest integer multiply */
 #else
-#define MULTIPLIER short  /* prefer 16-bit with SIMD for parellelism */
+#define MULTIPLIER  short       /* prefer 16-bit with SIMD for parellelism */
 #endif
 #endif
 
diff --git a/media/libjpeg/jpegcomp.h b/media/libjpeg/jpegcomp.h
index ade0d1edcd..c4834ac0df 100644
--- a/media/libjpeg/jpegcomp.h
+++ b/media/libjpeg/jpegcomp.h
@@ -1,7 +1,7 @@
 /*
  * jpegcomp.h
  *
- * Copyright (C) 2010, D. R. Commander.
+ * Copyright (C) 2010, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -11,21 +11,22 @@
  */
 
 #if JPEG_LIB_VERSION >= 70
-#define _DCT_scaled_size DCT_h_scaled_size
-#define _DCT_h_scaled_size DCT_h_scaled_size
-#define _DCT_v_scaled_size DCT_v_scaled_size
-#define _min_DCT_scaled_size min_DCT_h_scaled_size
-#define _min_DCT_h_scaled_size min_DCT_h_scaled_size
-#define _min_DCT_v_scaled_size min_DCT_v_scaled_size
-#define _jpeg_width jpeg_width
-#define _jpeg_height jpeg_height
+#define _DCT_scaled_size  DCT_h_scaled_size
+#define _DCT_h_scaled_size  DCT_h_scaled_size
+#define _DCT_v_scaled_size  DCT_v_scaled_size
+#define _min_DCT_scaled_size  min_DCT_h_scaled_size
+#define _min_DCT_h_scaled_size  min_DCT_h_scaled_size
+#define _min_DCT_v_scaled_size  min_DCT_v_scaled_size
+#define _jpeg_width  jpeg_width
+#define _jpeg_height  jpeg_height
+#define JERR_ARITH_NOTIMPL  JERR_NOT_COMPILED
 #else
-#define _DCT_scaled_size DCT_scaled_size
-#define _DCT_h_scaled_size DCT_scaled_size
-#define _DCT_v_scaled_size DCT_scaled_size
-#define _min_DCT_scaled_size min_DCT_scaled_size
-#define _min_DCT_h_scaled_size min_DCT_scaled_size
-#define _min_DCT_v_scaled_size min_DCT_scaled_size
-#define _jpeg_width image_width
-#define _jpeg_height image_height
+#define _DCT_scaled_size  DCT_scaled_size
+#define _DCT_h_scaled_size  DCT_scaled_size
+#define _DCT_v_scaled_size  DCT_scaled_size
+#define _min_DCT_scaled_size  min_DCT_scaled_size
+#define _min_DCT_h_scaled_size  min_DCT_scaled_size
+#define _min_DCT_v_scaled_size  min_DCT_scaled_size
+#define _jpeg_width  image_width
+#define _jpeg_height  image_height
 #endif
diff --git a/media/libjpeg/jpegint.h b/media/libjpeg/jpegint.h
index 9979a912d9..6af9e2a179 100644
--- a/media/libjpeg/jpegint.h
+++ b/media/libjpeg/jpegint.h
@@ -5,8 +5,9 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2019, 2021, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2021, Alex Richardson.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -27,33 +28,45 @@ typedef enum {            /* Operating modes for buffer controllers */
 } J_BUF_MODE;
 
 /* Values of global_state field (jdapi.c has some dependencies on ordering!) */
-#define CSTATE_START    100     /* after create_compress */
-#define CSTATE_SCANNING 101     /* start_compress done, write_scanlines OK */
-#define CSTATE_RAW_OK   102     /* start_compress done, write_raw_data OK */
-#define CSTATE_WRCOEFS  103     /* jpeg_write_coefficients done */
-#define DSTATE_START    200     /* after create_decompress */
-#define DSTATE_INHEADER 201     /* reading header markers, no SOS yet */
-#define DSTATE_READY    202     /* found SOS, ready for start_decompress */
-#define DSTATE_PRELOAD  203     /* reading multiscan file in start_decompress*/
-#define DSTATE_PRESCAN  204     /* performing dummy pass for 2-pass quant */
-#define DSTATE_SCANNING 205     /* start_decompress done, read_scanlines OK */
-#define DSTATE_RAW_OK   206     /* start_decompress done, read_raw_data OK */
-#define DSTATE_BUFIMAGE 207     /* expecting jpeg_start_output */
-#define DSTATE_BUFPOST  208     /* looking for SOS/EOI in jpeg_finish_output */
-#define DSTATE_RDCOEFS  209     /* reading file in jpeg_read_coefficients */
-#define DSTATE_STOPPING 210     /* looking for EOI in jpeg_finish_decompress */
+#define CSTATE_START     100    /* after create_compress */
+#define CSTATE_SCANNING  101    /* start_compress done, write_scanlines OK */
+#define CSTATE_RAW_OK    102    /* start_compress done, write_raw_data OK */
+#define CSTATE_WRCOEFS   103    /* jpeg_write_coefficients done */
+#define DSTATE_START     200    /* after create_decompress */
+#define DSTATE_INHEADER  201    /* reading header markers, no SOS yet */
+#define DSTATE_READY     202    /* found SOS, ready for start_decompress */
+#define DSTATE_PRELOAD   203    /* reading multiscan file in start_decompress*/
+#define DSTATE_PRESCAN   204    /* performing dummy pass for 2-pass quant */
+#define DSTATE_SCANNING  205    /* start_decompress done, read_scanlines OK */
+#define DSTATE_RAW_OK    206    /* start_decompress done, read_raw_data OK */
+#define DSTATE_BUFIMAGE  207    /* expecting jpeg_start_output */
+#define DSTATE_BUFPOST   208    /* looking for SOS/EOI in jpeg_finish_output */
+#define DSTATE_RDCOEFS   209    /* reading file in jpeg_read_coefficients */
+#define DSTATE_STOPPING  210    /* looking for EOI in jpeg_finish_decompress */
 
 
 /* JLONG must hold at least signed 32-bit values. */
 typedef long JLONG;
 
+/* JUINTPTR must hold pointer values. */
+#ifdef __UINTPTR_TYPE__
+/*
+ * __UINTPTR_TYPE__ is GNU-specific and available in GCC 4.6+ and Clang 3.0+.
+ * Fortunately, that is sufficient to support the few architectures for which
+ * sizeof(void *) != sizeof(size_t).  The only other options would require C99
+ * or Clang-specific builtins.
+ */
+typedef __UINTPTR_TYPE__ JUINTPTR;
+#else
+typedef size_t JUINTPTR;
+#endif
 
 /*
  * Left shift macro that handles a negative operand without causing any
  * sanitizer warnings
  */
 
-#define LEFT_SHIFT(a, b) ((JLONG)((unsigned long)(a) << (b)))
+#define LEFT_SHIFT(a, b)  ((JLONG)((unsigned long)(a) << (b)))
 
 
 /* Declarations for compression modules */
@@ -158,6 +171,9 @@ struct jpeg_decomp_master {
   JDIMENSION first_MCU_col[MAX_COMPONENTS];
   JDIMENSION last_MCU_col[MAX_COMPONENTS];
   boolean jinit_upsampler_no_alloc;
+
+  /* Last iMCU row that was successfully decoded */
+  JDIMENSION last_good_iMCU_row;
 };
 
 /* Input control module */
@@ -274,9 +290,9 @@ struct jpeg_color_quantizer {
 /* Miscellaneous useful macros */
 
 #undef MAX
-#define MAX(a,b)        ((a) > (b) ? (a) : (b))
+#define MAX(a, b)       ((a) > (b) ? (a) : (b))
 #undef MIN
-#define MIN(a,b)        ((a) < (b) ? (a) : (b))
+#define MIN(a, b)       ((a) < (b) ? (a) : (b))
 
 
 /* We assume that right shift corresponds to signed division by 2 with
@@ -291,64 +307,64 @@ struct jpeg_color_quantizer {
 
 #ifdef RIGHT_SHIFT_IS_UNSIGNED
 #define SHIFT_TEMPS     JLONG shift_temp;
-#define RIGHT_SHIFT(x,shft)  \
-        ((shift_temp = (x)) < 0 ? \
-         (shift_temp >> (shft)) | ((~((JLONG) 0)) << (32-(shft))) : \
-         (shift_temp >> (shft)))
+#define RIGHT_SHIFT(x, shft) \
+  ((shift_temp = (x)) < 0 ? \
+   (shift_temp >> (shft)) | ((~((JLONG)0)) << (32 - (shft))) : \
+   (shift_temp >> (shft)))
 #else
 #define SHIFT_TEMPS
-#define RIGHT_SHIFT(x,shft)     ((x) >> (shft))
+#define RIGHT_SHIFT(x, shft)    ((x) >> (shft))
 #endif
 
 
 /* Compression module initialization routines */
-EXTERN(void) jinit_compress_master (j_compress_ptr cinfo);
-EXTERN(void) jinit_c_master_control (j_compress_ptr cinfo,
-                                     boolean transcode_only);
-EXTERN(void) jinit_c_main_controller (j_compress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_c_prep_controller (j_compress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_c_coef_controller (j_compress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_color_converter (j_compress_ptr cinfo);
-EXTERN(void) jinit_downsampler (j_compress_ptr cinfo);
-EXTERN(void) jinit_forward_dct (j_compress_ptr cinfo);
-EXTERN(void) jinit_huff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_phuff_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_arith_encoder (j_compress_ptr cinfo);
-EXTERN(void) jinit_marker_writer (j_compress_ptr cinfo);
+EXTERN(void) jinit_compress_master(j_compress_ptr cinfo);
+EXTERN(void) jinit_c_master_control(j_compress_ptr cinfo,
+                                    boolean transcode_only);
+EXTERN(void) jinit_c_main_controller(j_compress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_c_prep_controller(j_compress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_c_coef_controller(j_compress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_color_converter(j_compress_ptr cinfo);
+EXTERN(void) jinit_downsampler(j_compress_ptr cinfo);
+EXTERN(void) jinit_forward_dct(j_compress_ptr cinfo);
+EXTERN(void) jinit_huff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_phuff_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_arith_encoder(j_compress_ptr cinfo);
+EXTERN(void) jinit_marker_writer(j_compress_ptr cinfo);
 /* Decompression module initialization routines */
-EXTERN(void) jinit_master_decompress (j_decompress_ptr cinfo);
-EXTERN(void) jinit_d_main_controller (j_decompress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_d_coef_controller (j_decompress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_d_post_controller (j_decompress_ptr cinfo,
-                                      boolean need_full_buffer);
-EXTERN(void) jinit_input_controller (j_decompress_ptr cinfo);
-EXTERN(void) jinit_marker_reader (j_decompress_ptr cinfo);
-EXTERN(void) jinit_huff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_phuff_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_arith_decoder (j_decompress_ptr cinfo);
-EXTERN(void) jinit_inverse_dct (j_decompress_ptr cinfo);
-EXTERN(void) jinit_upsampler (j_decompress_ptr cinfo);
-EXTERN(void) jinit_color_deconverter (j_decompress_ptr cinfo);
-EXTERN(void) jinit_1pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_2pass_quantizer (j_decompress_ptr cinfo);
-EXTERN(void) jinit_merged_upsampler (j_decompress_ptr cinfo);
+EXTERN(void) jinit_master_decompress(j_decompress_ptr cinfo);
+EXTERN(void) jinit_d_main_controller(j_decompress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_d_coef_controller(j_decompress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_d_post_controller(j_decompress_ptr cinfo,
+                                     boolean need_full_buffer);
+EXTERN(void) jinit_input_controller(j_decompress_ptr cinfo);
+EXTERN(void) jinit_marker_reader(j_decompress_ptr cinfo);
+EXTERN(void) jinit_huff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_phuff_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_arith_decoder(j_decompress_ptr cinfo);
+EXTERN(void) jinit_inverse_dct(j_decompress_ptr cinfo);
+EXTERN(void) jinit_upsampler(j_decompress_ptr cinfo);
+EXTERN(void) jinit_color_deconverter(j_decompress_ptr cinfo);
+EXTERN(void) jinit_1pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_2pass_quantizer(j_decompress_ptr cinfo);
+EXTERN(void) jinit_merged_upsampler(j_decompress_ptr cinfo);
 /* Memory manager initialization */
-EXTERN(void) jinit_memory_mgr (j_common_ptr cinfo);
+EXTERN(void) jinit_memory_mgr(j_common_ptr cinfo);
 
 /* Utility routines in jutils.c */
-EXTERN(long) jdiv_round_up (long a, long b);
-EXTERN(long) jround_up (long a, long b);
-EXTERN(void) jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
-                                JSAMPARRAY output_array, int dest_row,
-                                int num_rows, JDIMENSION num_cols);
-EXTERN(void) jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
-                              JDIMENSION num_blocks);
-EXTERN(void) jzero_far (void *target, size_t bytestozero);
+EXTERN(long) jdiv_round_up(long a, long b);
+EXTERN(long) jround_up(long a, long b);
+EXTERN(void) jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+                               JSAMPARRAY output_array, int dest_row,
+                               int num_rows, JDIMENSION num_cols);
+EXTERN(void) jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+                             JDIMENSION num_blocks);
+EXTERN(void) jzero_far(void *target, size_t bytestozero);
 /* Constant tables in jutils.c */
 #if 0                           /* This table is not actually needed in v6a */
 extern const int jpeg_zigzag_order[]; /* natural coef order to zigzag order */
@@ -357,12 +373,3 @@ extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
 
 /* Arithmetic coding probability estimation tables in jaricom.c */
 extern const JLONG jpeg_aritab[];
-
-/* Suppress undefined-structure complaints if necessary. */
-
-#ifdef INCOMPLETE_TYPES_BROKEN
-#ifndef AM_MEMORY_MANAGER       /* only jmemmgr.c defines these */
-struct jvirt_sarray_control { long dummy; };
-struct jvirt_barray_control { long dummy; };
-#endif
-#endif /* INCOMPLETE_TYPES_BROKEN */
diff --git a/media/libjpeg/jpeglib.h b/media/libjpeg/jpeglib.h
index 6c63f58222..d7664f0630 100644
--- a/media/libjpeg/jpeglib.h
+++ b/media/libjpeg/jpeglib.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016-2017, 2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -211,8 +211,8 @@ struct jpeg_marker_struct {
 
 /* Known color spaces. */
 
-#define JCS_EXTENSIONS 1
-#define JCS_ALPHA_EXTENSIONS 1
+#define JCS_EXTENSIONS  1
+#define JCS_ALPHA_EXTENSIONS  1
 
 typedef enum {
   JCS_UNKNOWN,            /* error/unspecified */
@@ -244,9 +244,9 @@ typedef enum {
 /* DCT/IDCT algorithm options. */
 
 typedef enum {
-  JDCT_ISLOW,             /* slow but accurate integer algorithm */
-  JDCT_IFAST,             /* faster, less accurate integer method */
-  JDCT_FLOAT              /* floating-point: accurate, fast on fast HW */
+  JDCT_ISLOW,             /* accurate integer method */
+  JDCT_IFAST,             /* less accurate integer method [legacy feature] */
+  JDCT_FLOAT              /* floating-point method [legacy feature] */
 } J_DCT_METHOD;
 
 #ifndef JDCT_DEFAULT            /* may be overridden in jconfig.h */
@@ -268,11 +268,11 @@ typedef enum {
 /* Common fields between JPEG compression and decompression master structs. */
 
 #define jpeg_common_fields \
-  struct jpeg_error_mgr *err;   /* Error handler module */\
-  struct jpeg_memory_mgr *mem;  /* Memory manager module */\
-  struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */\
-  void *client_data;            /* Available for use by application */\
-  boolean is_decompressor;      /* So common code can tell which is which */\
+  struct jpeg_error_mgr *err;   /* Error handler module */ \
+  struct jpeg_memory_mgr *mem;  /* Memory manager module */ \
+  struct jpeg_progress_mgr *progress; /* Progress monitor, or NULL if none */ \
+  void *client_data;            /* Available for use by application */ \
+  boolean is_decompressor;      /* So common code can tell which is which */ \
   int global_state              /* For checking call sequence validity */
 
 /* Routines that are to be used by both halves of the library are declared
@@ -822,9 +822,9 @@ struct jpeg_source_mgr {
  * successful.
  */
 
-#define JPOOL_PERMANENT 0       /* lasts until master record is destroyed */
-#define JPOOL_IMAGE     1       /* lasts until done with image/datastream */
-#define JPOOL_NUMPOOLS  2
+#define JPOOL_PERMANENT  0      /* lasts until master record is destroyed */
+#define JPOOL_IMAGE      1      /* lasts until done with image/datastream */
+#define JPOOL_NUMPOOLS   2
 
 typedef struct jvirt_sarray_control *jvirt_sarray_ptr;
 typedef struct jvirt_barray_control *jvirt_barray_ptr;
@@ -888,7 +888,7 @@ typedef boolean (*jpeg_marker_parser_method) (j_decompress_ptr cinfo);
 
 
 /* Default error-management setup */
-EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
+EXTERN(struct jpeg_error_mgr *) jpeg_std_error(struct jpeg_error_mgr *err);
 
 /* Initialization of JPEG compression objects.
  * jpeg_create_compress() and jpeg_create_decompress() are the exported
@@ -898,90 +898,95 @@ EXTERN(struct jpeg_error_mgr *) jpeg_std_error (struct jpeg_error_mgr *err);
  * NB: you must set up the error-manager BEFORE calling jpeg_create_xxx.
  */
 #define jpeg_create_compress(cinfo) \
-    jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
-                        (size_t) sizeof(struct jpeg_compress_struct))
+  jpeg_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+                      (size_t)sizeof(struct jpeg_compress_struct))
 #define jpeg_create_decompress(cinfo) \
-    jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
-                          (size_t) sizeof(struct jpeg_decompress_struct))
-EXTERN(void) jpeg_CreateCompress (j_compress_ptr cinfo, int version,
-                                  size_t structsize);
-EXTERN(void) jpeg_CreateDecompress (j_decompress_ptr cinfo, int version,
-                                    size_t structsize);
+  jpeg_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+                        (size_t)sizeof(struct jpeg_decompress_struct))
+EXTERN(void) jpeg_CreateCompress(j_compress_ptr cinfo, int version,
+                                 size_t structsize);
+EXTERN(void) jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+                                   size_t structsize);
 /* Destruction of JPEG compression objects */
-EXTERN(void) jpeg_destroy_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_destroy_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_destroy_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_destroy_decompress(j_decompress_ptr cinfo);
 
 /* Standard data source and destination managers: stdio streams. */
 /* Caller is responsible for opening the file before and closing after. */
-EXTERN(void) jpeg_stdio_dest (j_compress_ptr cinfo, FILE *outfile);
-EXTERN(void) jpeg_stdio_src (j_decompress_ptr cinfo, FILE *infile);
+EXTERN(void) jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile);
+EXTERN(void) jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile);
 
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
 /* Data source and destination managers: memory buffers. */
-EXTERN(void) jpeg_mem_dest (j_compress_ptr cinfo, unsigned char **outbuffer,
-                            unsigned long *outsize);
-EXTERN(void) jpeg_mem_src (j_decompress_ptr cinfo,
-                           const unsigned char *inbuffer,
-                           unsigned long insize);
+EXTERN(void) jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+                           unsigned long *outsize);
+EXTERN(void) jpeg_mem_src(j_decompress_ptr cinfo,
+                          const unsigned char *inbuffer, unsigned long insize);
 #endif
 
 /* Default parameter setup for compression */
-EXTERN(void) jpeg_set_defaults (j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_defaults(j_compress_ptr cinfo);
 /* Compression parameter setup aids */
-EXTERN(void) jpeg_set_colorspace (j_compress_ptr cinfo,
-                                  J_COLOR_SPACE colorspace);
-EXTERN(void) jpeg_default_colorspace (j_compress_ptr cinfo);
-EXTERN(void) jpeg_set_quality (j_compress_ptr cinfo, int quality,
-                               boolean force_baseline);
-EXTERN(void) jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
-                                      boolean force_baseline);
+EXTERN(void) jpeg_set_colorspace(j_compress_ptr cinfo,
+                                 J_COLOR_SPACE colorspace);
+EXTERN(void) jpeg_default_colorspace(j_compress_ptr cinfo);
+EXTERN(void) jpeg_set_quality(j_compress_ptr cinfo, int quality,
+                              boolean force_baseline);
+EXTERN(void) jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                                     boolean force_baseline);
 #if JPEG_LIB_VERSION >= 70
-EXTERN(void) jpeg_default_qtables (j_compress_ptr cinfo,
-                                   boolean force_baseline);
+EXTERN(void) jpeg_default_qtables(j_compress_ptr cinfo,
+                                  boolean force_baseline);
 #endif
-EXTERN(void) jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
-                                   const unsigned int *basic_table,
-                                   int scale_factor, boolean force_baseline);
-EXTERN(int) jpeg_quality_scaling (int quality);
-EXTERN(void) jpeg_simple_progression (j_compress_ptr cinfo);
-EXTERN(void) jpeg_suppress_tables (j_compress_ptr cinfo, boolean suppress);
-EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table (j_common_ptr cinfo);
-EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table (j_common_ptr cinfo);
+EXTERN(void) jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                                  const unsigned int *basic_table,
+                                  int scale_factor, boolean force_baseline);
+EXTERN(int) jpeg_quality_scaling(int quality);
+EXTERN(void) jpeg_simple_progression(j_compress_ptr cinfo);
+EXTERN(void) jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+EXTERN(JQUANT_TBL *) jpeg_alloc_quant_table(j_common_ptr cinfo);
+EXTERN(JHUFF_TBL *) jpeg_alloc_huff_table(j_common_ptr cinfo);
 
 /* Main entry points for compression */
-EXTERN(void) jpeg_start_compress (j_compress_ptr cinfo,
-                                  boolean write_all_tables);
-EXTERN(JDIMENSION) jpeg_write_scanlines (j_compress_ptr cinfo,
-                                         JSAMPARRAY scanlines,
-                                         JDIMENSION num_lines);
-EXTERN(void) jpeg_finish_compress (j_compress_ptr cinfo);
+EXTERN(void) jpeg_start_compress(j_compress_ptr cinfo,
+                                 boolean write_all_tables);
+EXTERN(JDIMENSION) jpeg_write_scanlines(j_compress_ptr cinfo,
+                                        JSAMPARRAY scanlines,
+                                        JDIMENSION num_lines);
+EXTERN(void) jpeg_finish_compress(j_compress_ptr cinfo);
 
 #if JPEG_LIB_VERSION >= 70
 /* Precalculate JPEG dimensions for current compression parameters. */
-EXTERN(void) jpeg_calc_jpeg_dimensions (j_compress_ptr cinfo);
+EXTERN(void) jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo);
 #endif
 
 /* Replaces jpeg_write_scanlines when writing raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_write_raw_data (j_compress_ptr cinfo, JSAMPIMAGE data,
-                                        JDIMENSION num_lines);
+EXTERN(JDIMENSION) jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                       JDIMENSION num_lines);
 
 /* Write a special marker.  See libjpeg.txt concerning safe usage. */
-EXTERN(void) jpeg_write_marker (j_compress_ptr cinfo, int marker,
-                                const JOCTET *dataptr, unsigned int datalen);
+EXTERN(void) jpeg_write_marker(j_compress_ptr cinfo, int marker,
+                               const JOCTET *dataptr, unsigned int datalen);
 /* Same, but piecemeal. */
-EXTERN(void) jpeg_write_m_header (j_compress_ptr cinfo, int marker,
-                                  unsigned int datalen);
-EXTERN(void) jpeg_write_m_byte (j_compress_ptr cinfo, int val);
+EXTERN(void) jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+                                 unsigned int datalen);
+EXTERN(void) jpeg_write_m_byte(j_compress_ptr cinfo, int val);
 
 /* Alternate compression function: just write an abbreviated table file */
-EXTERN(void) jpeg_write_tables (j_compress_ptr cinfo);
+EXTERN(void) jpeg_write_tables(j_compress_ptr cinfo);
+
+/* Write ICC profile.  See libjpeg.txt for usage information. */
+EXTERN(void) jpeg_write_icc_profile(j_compress_ptr cinfo,
+                                    const JOCTET *icc_data_ptr,
+                                    unsigned int icc_data_len);
+
 
 /* Decompression startup: read start of JPEG datastream to see what's there */
-EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
+EXTERN(int) jpeg_read_header(j_decompress_ptr cinfo, boolean require_image);
 /* Return value is one of: */
-#define JPEG_SUSPENDED          0 /* Suspended due to lack of input data */
-#define JPEG_HEADER_OK          1 /* Found valid image datastream */
-#define JPEG_HEADER_TABLES_ONLY 2 /* Found valid table-specs-only datastream */
+#define JPEG_SUSPENDED           0 /* Suspended due to lack of input data */
+#define JPEG_HEADER_OK           1 /* Found valid image datastream */
+#define JPEG_HEADER_TABLES_ONLY  2 /* Found valid table-specs-only datastream */
 /* If you pass require_image = TRUE (normal case), you need not check for
  * a TABLES_ONLY return code; an abbreviated file will cause an error exit.
  * JPEG_SUSPENDED is only possible if you use a data source module that can
@@ -989,27 +994,27 @@ EXTERN(int) jpeg_read_header (j_decompress_ptr cinfo, boolean require_image);
  */
 
 /* Main entry points for decompression */
-EXTERN(boolean) jpeg_start_decompress (j_decompress_ptr cinfo);
-EXTERN(JDIMENSION) jpeg_read_scanlines (j_decompress_ptr cinfo,
-                                        JSAMPARRAY scanlines,
-                                        JDIMENSION max_lines);
-EXTERN(JDIMENSION) jpeg_skip_scanlines (j_decompress_ptr cinfo,
-                                        JDIMENSION num_lines);
-EXTERN(void) jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
-                                 JDIMENSION *width);
-EXTERN(boolean) jpeg_finish_decompress (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_decompress(j_decompress_ptr cinfo);
+EXTERN(JDIMENSION) jpeg_read_scanlines(j_decompress_ptr cinfo,
+                                       JSAMPARRAY scanlines,
+                                       JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_skip_scanlines(j_decompress_ptr cinfo,
+                                       JDIMENSION num_lines);
+EXTERN(void) jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                                JDIMENSION *width);
+EXTERN(boolean) jpeg_finish_decompress(j_decompress_ptr cinfo);
 
 /* Replaces jpeg_read_scanlines when reading raw downsampled data. */
-EXTERN(JDIMENSION) jpeg_read_raw_data (j_decompress_ptr cinfo, JSAMPIMAGE data,
-                                       JDIMENSION max_lines);
+EXTERN(JDIMENSION) jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                      JDIMENSION max_lines);
 
 /* Additional entry points for buffered-image mode. */
-EXTERN(boolean) jpeg_has_multiple_scans (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_start_output (j_decompress_ptr cinfo, int scan_number);
-EXTERN(boolean) jpeg_finish_output (j_decompress_ptr cinfo);
-EXTERN(boolean) jpeg_input_complete (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_new_colormap (j_decompress_ptr cinfo);
-EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_has_multiple_scans(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_start_output(j_decompress_ptr cinfo, int scan_number);
+EXTERN(boolean) jpeg_finish_output(j_decompress_ptr cinfo);
+EXTERN(boolean) jpeg_input_complete(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_new_colormap(j_decompress_ptr cinfo);
+EXTERN(int) jpeg_consume_input(j_decompress_ptr cinfo);
 /* Return value is one of: */
 /* #define JPEG_SUSPENDED       0    Suspended due to lack of input data */
 #define JPEG_REACHED_SOS        1 /* Reached start of new scan */
@@ -1019,25 +1024,25 @@ EXTERN(int) jpeg_consume_input (j_decompress_ptr cinfo);
 
 /* Precalculate output dimensions for current decompression parameters. */
 #if JPEG_LIB_VERSION >= 80
-EXTERN(void) jpeg_core_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_core_output_dimensions(j_decompress_ptr cinfo);
 #endif
-EXTERN(void) jpeg_calc_output_dimensions (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_calc_output_dimensions(j_decompress_ptr cinfo);
 
 /* Control saving of COM and APPn markers into marker_list. */
-EXTERN(void) jpeg_save_markers (j_decompress_ptr cinfo, int marker_code,
-                                unsigned int length_limit);
+EXTERN(void) jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                               unsigned int length_limit);
 
 /* Install a special processing method for COM or APPn markers. */
-EXTERN(void) jpeg_set_marker_processor (j_decompress_ptr cinfo,
-                                        int marker_code,
-                                        jpeg_marker_parser_method routine);
+EXTERN(void) jpeg_set_marker_processor(j_decompress_ptr cinfo,
+                                       int marker_code,
+                                       jpeg_marker_parser_method routine);
 
 /* Read or write raw DCT coefficients --- useful for lossless transcoding. */
-EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients (j_decompress_ptr cinfo);
-EXTERN(void) jpeg_write_coefficients (j_compress_ptr cinfo,
-                                      jvirt_barray_ptr *coef_arrays);
-EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
-                                            j_compress_ptr dstinfo);
+EXTERN(jvirt_barray_ptr *) jpeg_read_coefficients(j_decompress_ptr cinfo);
+EXTERN(void) jpeg_write_coefficients(j_compress_ptr cinfo,
+                                     jvirt_barray_ptr *coef_arrays);
+EXTERN(void) jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                           j_compress_ptr dstinfo);
 
 /* If you choose to abort compression or decompression before completing
  * jpeg_finish_(de)compress, then you need to clean up to release memory,
@@ -1045,17 +1050,22 @@ EXTERN(void) jpeg_copy_critical_parameters (j_decompress_ptr srcinfo,
  * if you're done with the JPEG object, but if you want to clean it up and
  * reuse it, call this:
  */
-EXTERN(void) jpeg_abort_compress (j_compress_ptr cinfo);
-EXTERN(void) jpeg_abort_decompress (j_decompress_ptr cinfo);
+EXTERN(void) jpeg_abort_compress(j_compress_ptr cinfo);
+EXTERN(void) jpeg_abort_decompress(j_decompress_ptr cinfo);
 
 /* Generic versions of jpeg_abort and jpeg_destroy that work on either
  * flavor of JPEG object.  These may be more convenient in some places.
  */
-EXTERN(void) jpeg_abort (j_common_ptr cinfo);
-EXTERN(void) jpeg_destroy (j_common_ptr cinfo);
+EXTERN(void) jpeg_abort(j_common_ptr cinfo);
+EXTERN(void) jpeg_destroy(j_common_ptr cinfo);
 
 /* Default restart-marker-resync procedure for use by data source modules */
-EXTERN(boolean) jpeg_resync_to_restart (j_decompress_ptr cinfo, int desired);
+EXTERN(boolean) jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+/* Read ICC profile.  See libjpeg.txt for usage information. */
+EXTERN(boolean) jpeg_read_icc_profile(j_decompress_ptr cinfo,
+                                      JOCTET **icc_data_ptr,
+                                      unsigned int *icc_data_len);
 
 
 /* These marker codes are exported since applications and data source modules
diff --git a/media/libjpeg/jquant1.c b/media/libjpeg/jquant1.c
index e7814815ef..73b83e16e5 100644
--- a/media/libjpeg/jquant1.c
+++ b/media/libjpeg/jquant1.c
@@ -73,8 +73,9 @@
 
 #define ODITHER_SIZE  16        /* dimension of dither matrix */
 /* NB: if ODITHER_SIZE is not a power of 2, ODITHER_MASK uses will break */
-#define ODITHER_CELLS (ODITHER_SIZE*ODITHER_SIZE)       /* # cells in matrix */
-#define ODITHER_MASK  (ODITHER_SIZE-1) /* mask for wrapping around counters */
+#define ODITHER_CELLS  (ODITHER_SIZE * ODITHER_SIZE) /* # cells in matrix */
+#define ODITHER_MASK  (ODITHER_SIZE - 1) /* mask for wrapping around
+                                            counters */
 
 typedef int ODITHER_MATRIX[ODITHER_SIZE][ODITHER_SIZE];
 typedef int (*ODITHER_MATRIX_PTR)[ODITHER_SIZE];
@@ -132,12 +133,12 @@ typedef JLONG FSERROR;          /* may need more than 16 bits */
 typedef JLONG LOCFSERROR;       /* be sure calculation temps are big enough */
 #endif
 
-typedef FSERROR *FSERRPTR;  /* pointer to error array */
+typedef FSERROR *FSERRPTR;      /* pointer to error array */
 
 
 /* Private subobject */
 
-#define MAX_Q_COMPS 4           /* max components I can handle */
+#define MAX_Q_COMPS  4          /* max components I can handle */
 
 typedef struct {
   struct jpeg_color_quantizer pub; /* public fields */
@@ -153,7 +154,7 @@ typedef struct {
    */
   boolean is_padded;            /* is the colorindex padded for odither? */
 
-  int Ncolors[MAX_Q_COMPS];     /* # of values alloced to each component */
+  int Ncolors[MAX_Q_COMPS];     /* # of values allocated to each component */
 
   /* Variables for ordered dithering */
   int row_index;                /* cur row's vertical index in dither matrix */
@@ -183,7 +184,7 @@ typedef my_cquantizer *my_cquantize_ptr;
 
 
 LOCAL(int)
-select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
+select_ncolors(j_decompress_ptr cinfo, int Ncolors[])
 /* Determine allocation of desired colors to components, */
 /* and fill in Ncolors[] array to indicate choice. */
 /* Return value is total number of colors (product of Ncolors[] values). */
@@ -206,12 +207,12 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
     temp = iroot;               /* set temp = iroot ** nc */
     for (i = 1; i < nc; i++)
       temp *= iroot;
-  } while (temp <= (long) max_colors); /* repeat till iroot exceeds root */
+  } while (temp <= (long)max_colors); /* repeat till iroot exceeds root */
   iroot--;                      /* now iroot = floor(root) */
 
   /* Must have at least 2 color values per component */
   if (iroot < 2)
-    ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int) temp);
+    ERREXIT1(cinfo, JERR_QUANT_FEW_COLORS, (int)temp);
 
   /* Initialize to iroot color values for each component */
   total_colors = 1;
@@ -231,11 +232,11 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
       j = (cinfo->out_color_space == JCS_RGB ? RGB_order[i] : i);
       /* calculate new total_colors if Ncolors[j] is incremented */
       temp = total_colors / Ncolors[j];
-      temp *= Ncolors[j]+1;     /* done in long arith to avoid oflo */
-      if (temp > (long) max_colors)
+      temp *= Ncolors[j] + 1;   /* done in long arith to avoid oflo */
+      if (temp > (long)max_colors)
         break;                  /* won't fit, done with this pass */
       Ncolors[j]++;             /* OK, apply the increment */
-      total_colors = (int) temp;
+      total_colors = (int)temp;
       changed = TRUE;
     }
   } while (changed);
@@ -245,7 +246,7 @@ select_ncolors (j_decompress_ptr cinfo, int Ncolors[])
 
 
 LOCAL(int)
-output_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+output_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
 /* Return j'th output value, where j will range from 0 to maxj */
 /* The output values must fall in 0..MAXJSAMPLE in increasing order */
 {
@@ -254,17 +255,17 @@ output_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
    * (Forcing the upper and lower values to the limits ensures that
    * dithering can't produce a color outside the selected gamut.)
    */
-  return (int) (((JLONG) j * MAXJSAMPLE + maxj/2) / maxj);
+  return (int)(((JLONG)j * MAXJSAMPLE + maxj / 2) / maxj);
 }
 
 
 LOCAL(int)
-largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
+largest_input_value(j_decompress_ptr cinfo, int ci, int j, int maxj)
 /* Return largest input value that should map to j'th output value */
 /* Must have largest(j=0) >= 0, and largest(j=maxj) >= MAXJSAMPLE */
 {
   /* Breakpoints are halfway between values returned by output_value */
-  return (int) (((JLONG) (2*j + 1) * MAXJSAMPLE + maxj) / (2*maxj));
+  return (int)(((JLONG)(2 * j + 1) * MAXJSAMPLE + maxj) / (2 * maxj));
 }
 
 
@@ -273,21 +274,21 @@ largest_input_value (j_decompress_ptr cinfo, int ci, int j, int maxj)
  */
 
 LOCAL(void)
-create_colormap (j_decompress_ptr cinfo)
+create_colormap(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   JSAMPARRAY colormap;          /* Created colormap */
   int total_colors;             /* Number of distinct output colors */
-  int i,j,k, nci, blksize, blkdist, ptr, val;
+  int i, j, k, nci, blksize, blkdist, ptr, val;
 
   /* Select number of colors for each component */
   total_colors = select_ncolors(cinfo, cquantize->Ncolors);
 
   /* Report selected color counts */
   if (cinfo->out_color_components == 3)
-    TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS,
-             total_colors, cquantize->Ncolors[0],
-             cquantize->Ncolors[1], cquantize->Ncolors[2]);
+    TRACEMS4(cinfo, 1, JTRC_QUANT_3_NCOLORS, total_colors,
+             cquantize->Ncolors[0], cquantize->Ncolors[1],
+             cquantize->Ncolors[2]);
   else
     TRACEMS1(cinfo, 1, JTRC_QUANT_NCOLORS, total_colors);
 
@@ -296,8 +297,8 @@ create_colormap (j_decompress_ptr cinfo)
   /* i.e. rightmost (highest-indexed) color changes most rapidly. */
 
   colormap = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     (JDIMENSION) total_colors, (JDIMENSION) cinfo->out_color_components);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)total_colors, (JDIMENSION)cinfo->out_color_components);
 
   /* blksize is number of adjacent repeated entries for a component */
   /* blkdist is distance between groups of identical entries for a component */
@@ -309,12 +310,12 @@ create_colormap (j_decompress_ptr cinfo)
     blksize = blkdist / nci;
     for (j = 0; j < nci; j++) {
       /* Compute j'th output value (out of nci) for component */
-      val = output_value(cinfo, i, j, nci-1);
+      val = output_value(cinfo, i, j, nci - 1);
       /* Fill in all colormap entries that have this value of this component */
       for (ptr = j * blksize; ptr < total_colors; ptr += blkdist) {
         /* fill in blksize entries beginning at ptr */
         for (k = 0; k < blksize; k++)
-          colormap[i][ptr+k] = (JSAMPLE) val;
+          colormap[i][ptr + k] = (JSAMPLE)val;
       }
     }
     blkdist = blksize;          /* blksize of this color is blkdist of next */
@@ -333,11 +334,11 @@ create_colormap (j_decompress_ptr cinfo)
  */
 
 LOCAL(void)
-create_colorindex (j_decompress_ptr cinfo)
+create_colorindex(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   JSAMPROW indexptr;
-  int i,j,k, nci, blksize, val, pad;
+  int i, j, k, nci, blksize, val, pad;
 
   /* For ordered dither, we pad the color index tables by MAXJSAMPLE in
    * each direction (input index values can be -MAXJSAMPLE .. 2*MAXJSAMPLE).
@@ -345,7 +346,7 @@ create_colorindex (j_decompress_ptr cinfo)
    * flag whether it was done in case user changes dithering mode.
    */
   if (cinfo->dither_mode == JDITHER_ORDERED) {
-    pad = MAXJSAMPLE*2;
+    pad = MAXJSAMPLE * 2;
     cquantize->is_padded = TRUE;
   } else {
     pad = 0;
@@ -353,9 +354,9 @@ create_colorindex (j_decompress_ptr cinfo)
   }
 
   cquantize->colorindex = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE,
-     (JDIMENSION) (MAXJSAMPLE+1 + pad),
-     (JDIMENSION) cinfo->out_color_components);
+    ((j_common_ptr)cinfo, JPOOL_IMAGE,
+     (JDIMENSION)(MAXJSAMPLE + 1 + pad),
+     (JDIMENSION)cinfo->out_color_components);
 
   /* blksize is number of adjacent repeated entries for a component */
   blksize = cquantize->sv_actual;
@@ -373,18 +374,18 @@ create_colorindex (j_decompress_ptr cinfo)
     /* and k = largest j that maps to current val */
     indexptr = cquantize->colorindex[i];
     val = 0;
-    k = largest_input_value(cinfo, i, 0, nci-1);
+    k = largest_input_value(cinfo, i, 0, nci - 1);
     for (j = 0; j <= MAXJSAMPLE; j++) {
       while (j > k)             /* advance val if past boundary */
-        k = largest_input_value(cinfo, i, ++val, nci-1);
+        k = largest_input_value(cinfo, i, ++val, nci - 1);
       /* premultiply so that no multiplication needed in main processing */
-      indexptr[j] = (JSAMPLE) (val * blksize);
+      indexptr[j] = (JSAMPLE)(val * blksize);
     }
     /* Pad at both ends if necessary */
     if (pad)
       for (j = 1; j <= MAXJSAMPLE; j++) {
         indexptr[-j] = indexptr[0];
-        indexptr[MAXJSAMPLE+j] = indexptr[MAXJSAMPLE];
+        indexptr[MAXJSAMPLE + j] = indexptr[MAXJSAMPLE];
       }
   }
 }
@@ -396,29 +397,29 @@ create_colorindex (j_decompress_ptr cinfo)
  */
 
 LOCAL(ODITHER_MATRIX_PTR)
-make_odither_array (j_decompress_ptr cinfo, int ncolors)
+make_odither_array(j_decompress_ptr cinfo, int ncolors)
 {
   ODITHER_MATRIX_PTR odither;
-  int j,k;
-  JLONG num,den;
+  int j, k;
+  JLONG num, den;
 
   odither = (ODITHER_MATRIX_PTR)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(ODITHER_MATRIX));
   /* The inter-value distance for this color is MAXJSAMPLE/(ncolors-1).
    * Hence the dither value for the matrix cell with fill order f
    * (f=0..N-1) should be (N-1-2*f)/(2*N) * MAXJSAMPLE/(ncolors-1).
    * On 16-bit-int machine, be careful to avoid overflow.
    */
-  den = 2 * ODITHER_CELLS * ((JLONG) (ncolors - 1));
+  den = 2 * ODITHER_CELLS * ((JLONG)(ncolors - 1));
   for (j = 0; j < ODITHER_SIZE; j++) {
     for (k = 0; k < ODITHER_SIZE; k++) {
-      num = ((JLONG) (ODITHER_CELLS-1 - 2*((int)base_dither_matrix[j][k])))
-            * MAXJSAMPLE;
+      num = ((JLONG)(ODITHER_CELLS - 1 -
+                     2 * ((int)base_dither_matrix[j][k]))) * MAXJSAMPLE;
       /* Ensure round towards zero despite C's lack of consistency
        * about rounding negative values in integer division...
        */
-      odither[j][k] = (int) (num<0 ? -((-num)/den) : num/den);
+      odither[j][k] = (int)(num < 0 ? -((-num) / den) : num / den);
     }
   }
   return odither;
@@ -432,9 +433,9 @@ make_odither_array (j_decompress_ptr cinfo, int ncolors)
  */
 
 LOCAL(void)
-create_odither_tables (j_decompress_ptr cinfo)
+create_odither_tables(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   ODITHER_MATRIX_PTR odither;
   int i, j, nci;
 
@@ -459,11 +460,11 @@ create_odither_tables (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                JSAMPARRAY output_buf, int num_rows)
+color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+               JSAMPARRAY output_buf, int num_rows)
 /* General case, no dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   JSAMPARRAY colorindex = cquantize->colorindex;
   register int pixcode, ci;
   register JSAMPROW ptrin, ptrout;
@@ -478,20 +479,20 @@ color_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     for (col = width; col > 0; col--) {
       pixcode = 0;
       for (ci = 0; ci < nc; ci++) {
-        pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+        pixcode += colorindex[ci][*ptrin++];
       }
-      *ptrout++ = (JSAMPLE) pixcode;
+      *ptrout++ = (JSAMPLE)pixcode;
     }
   }
 }
 
 
 METHODDEF(void)
-color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                 JSAMPARRAY output_buf, int num_rows)
+color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, no dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register int pixcode;
   register JSAMPROW ptrin, ptrout;
   JSAMPROW colorindex0 = cquantize->colorindex[0];
@@ -505,21 +506,21 @@ color_quantize3 (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     ptrin = input_buf[row];
     ptrout = output_buf[row];
     for (col = width; col > 0; col--) {
-      pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
-      *ptrout++ = (JSAMPLE) pixcode;
+      pixcode  = colorindex0[*ptrin++];
+      pixcode += colorindex1[*ptrin++];
+      pixcode += colorindex2[*ptrin++];
+      *ptrout++ = (JSAMPLE)pixcode;
     }
   }
 }
 
 
 METHODDEF(void)
-quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                     JSAMPARRAY output_buf, int num_rows)
+quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                    JSAMPARRAY output_buf, int num_rows)
 /* General case, with ordered dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register JSAMPROW input_ptr;
   register JSAMPROW output_ptr;
   JSAMPROW colorindex_ci;
@@ -533,7 +534,7 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
     row_index = cquantize->row_index;
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
@@ -550,7 +551,8 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
          * inputs.  The maximum dither is +- MAXJSAMPLE; this sets the
          * required amount of padding.
          */
-        *output_ptr += colorindex_ci[GETJSAMPLE(*input_ptr)+dither[col_index]];
+        *output_ptr +=
+          colorindex_ci[*input_ptr + dither[col_index]];
         input_ptr += nc;
         output_ptr++;
         col_index = (col_index + 1) & ODITHER_MASK;
@@ -564,11 +566,11 @@ quantize_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
 
 METHODDEF(void)
-quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                      JSAMPARRAY output_buf, int num_rows)
+quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPARRAY output_buf, int num_rows)
 /* Fast path for out_color_components==3, with ordered dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register int pixcode;
   register JSAMPROW input_ptr;
   register JSAMPROW output_ptr;
@@ -593,13 +595,10 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     col_index = 0;
 
     for (col = width; col > 0; col--) {
-      pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) +
-                                        dither0[col_index]]);
-      pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) +
-                                        dither1[col_index]]);
-      pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) +
-                                        dither2[col_index]]);
-      *output_ptr++ = (JSAMPLE) pixcode;
+      pixcode  = colorindex0[(*input_ptr++) + dither0[col_index]];
+      pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
+      pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
+      *output_ptr++ = (JSAMPLE)pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
     row_index = (row_index + 1) & ODITHER_MASK;
@@ -609,11 +608,11 @@ quantize3_ord_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
 
 METHODDEF(void)
-quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                    JSAMPARRAY output_buf, int num_rows)
+quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                   JSAMPARRAY output_buf, int num_rows)
 /* General case, with Floyd-Steinberg dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register LOCFSERROR cur;      /* current error or pixel value */
   LOCFSERROR belowerr;          /* error for pixel below cur */
   LOCFSERROR bpreverr;          /* error for below/prev col */
@@ -637,17 +636,17 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
 
   for (row = 0; row < num_rows; row++) {
     /* Initialize output values to 0 so can process components separately */
-    jzero_far((void *) output_buf[row], (size_t) (width * sizeof(JSAMPLE)));
+    jzero_far((void *)output_buf[row], (size_t)(width * sizeof(JSAMPLE)));
     for (ci = 0; ci < nc; ci++) {
       input_ptr = input_buf[row] + ci;
       output_ptr = output_buf[row];
       if (cquantize->on_odd_row) {
         /* work right to left in this row */
-        input_ptr += (width-1) * nc; /* so point to rightmost pixel */
-        output_ptr += width-1;
+        input_ptr += (width - 1) * nc; /* so point to rightmost pixel */
+        output_ptr += width - 1;
         dir = -1;
         dirnc = -nc;
-        errorptr = cquantize->fserrors[ci] + (width+1); /* => entry after last column */
+        errorptr = cquantize->fserrors[ci] + (width + 1); /* => entry after last column */
       } else {
         /* work left to right in this row */
         dir = 1;
@@ -675,15 +674,15 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
          * The maximum error is +- MAXJSAMPLE; this sets the required size
          * of the range_limit array.
          */
-        cur += GETJSAMPLE(*input_ptr);
-        cur = GETJSAMPLE(range_limit[cur]);
+        cur += *input_ptr;
+        cur = range_limit[cur];
         /* Select output value, accumulate into output code for this pixel */
-        pixcode = GETJSAMPLE(colorindex_ci[cur]);
-        *output_ptr += (JSAMPLE) pixcode;
+        pixcode = colorindex_ci[cur];
+        *output_ptr += (JSAMPLE)pixcode;
         /* Compute actual representation error at this pixel */
         /* Note: we can do this even though we don't have the final */
         /* pixel code, because the colormap is orthogonal. */
-        cur -= GETJSAMPLE(colormap_ci[pixcode]);
+        cur -= colormap_ci[pixcode];
         /* Compute error fractions to be propagated to adjacent pixels.
          * Add these into the running sums, and simultaneously shift the
          * next-line error sums left by 1 column.
@@ -691,7 +690,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
         bnexterr = cur;
         delta = cur * 2;
         cur += delta;           /* form error * 3 */
-        errorptr[0] = (FSERROR) (bpreverr + cur);
+        errorptr[0] = (FSERROR)(bpreverr + cur);
         cur += delta;           /* form error * 5 */
         bpreverr = belowerr + cur;
         belowerr = bnexterr;
@@ -708,7 +707,7 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
        * final fserrors[] entry.  Note we need not unload belowerr because
        * it is for the dummy column before or after the actual array.
        */
-      errorptr[0] = (FSERROR) bpreverr; /* unload prev err into array */
+      errorptr[0] = (FSERROR)bpreverr; /* unload prev err into array */
     }
     cquantize->on_odd_row = (cquantize->on_odd_row ? FALSE : TRUE);
   }
@@ -720,16 +719,16 @@ quantize_fs_dither (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
  */
 
 LOCAL(void)
-alloc_fs_workspace (j_decompress_ptr cinfo)
+alloc_fs_workspace(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   size_t arraysize;
   int i;
 
-  arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+  arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
   for (i = 0; i < cinfo->out_color_components; i++) {
     cquantize->fserrors[i] = (FSERRPTR)
-      (*cinfo->mem->alloc_large)((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
   }
 }
 
@@ -739,9 +738,9 @@ alloc_fs_workspace (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_1_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   size_t arraysize;
   int i;
 
@@ -767,7 +766,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
      * we must recreate the color index table with padding.
      * This will cost extra space, but probably isn't very likely.
      */
-    if (! cquantize->is_padded)
+    if (!cquantize->is_padded)
       create_colorindex(cinfo);
     /* Create ordered-dither tables if we didn't already. */
     if (cquantize->odither[0] == NULL)
@@ -780,9 +779,9 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
     if (cquantize->fserrors[0] == NULL)
       alloc_fs_workspace(cinfo);
     /* Initialize the propagated errors to zero. */
-    arraysize = (size_t) ((cinfo->output_width + 2) * sizeof(FSERROR));
+    arraysize = (size_t)((cinfo->output_width + 2) * sizeof(FSERROR));
     for (i = 0; i < cinfo->out_color_components; i++)
-      jzero_far((void *) cquantize->fserrors[i], arraysize);
+      jzero_far((void *)cquantize->fserrors[i], arraysize);
     break;
   default:
     ERREXIT(cinfo, JERR_NOT_COMPILED);
@@ -796,7 +795,7 @@ start_pass_1_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
  */
 
 METHODDEF(void)
-finish_pass_1_quant (j_decompress_ptr cinfo)
+finish_pass_1_quant(j_decompress_ptr cinfo)
 {
   /* no work in 1-pass case */
 }
@@ -808,7 +807,7 @@ finish_pass_1_quant (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-new_color_map_1_quant (j_decompress_ptr cinfo)
+new_color_map_1_quant(j_decompress_ptr cinfo)
 {
   ERREXIT(cinfo, JERR_MODE_CHANGE);
 }
@@ -819,14 +818,14 @@ new_color_map_1_quant (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_1pass_quantizer (j_decompress_ptr cinfo)
+jinit_1pass_quantizer(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize;
 
   cquantize = (my_cquantize_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_cquantizer));
-  cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+  cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
   cquantize->pub.start_pass = start_pass_1_quant;
   cquantize->pub.finish_pass = finish_pass_1_quant;
   cquantize->pub.new_color_map = new_color_map_1_quant;
@@ -837,8 +836,8 @@ jinit_1pass_quantizer (j_decompress_ptr cinfo)
   if (cinfo->out_color_components > MAX_Q_COMPS)
     ERREXIT1(cinfo, JERR_QUANT_COMPONENTS, MAX_Q_COMPS);
   /* Make sure colormap indexes can be represented by JSAMPLEs */
-  if (cinfo->desired_number_of_colors > (MAXJSAMPLE+1))
-    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE+1);
+  if (cinfo->desired_number_of_colors > (MAXJSAMPLE + 1))
+    ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXJSAMPLE + 1);
 
   /* Create the colormap and color index table. */
   create_colormap(cinfo);
diff --git a/media/libjpeg/jquant2.c b/media/libjpeg/jquant2.c
index cfbd0f1526..44efb18cad 100644
--- a/media/libjpeg/jquant2.c
+++ b/media/libjpeg/jquant2.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009, 2014-2015, D. R. Commander.
+ * Copyright (C) 2009, 2014-2015, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -73,14 +73,14 @@
  * probably need to change these scale factors.
  */
 
-#define R_SCALE 2               /* scale R distances by this much */
-#define G_SCALE 3               /* scale G distances by this much */
-#define B_SCALE 1               /* and B by this much */
+#define R_SCALE  2              /* scale R distances by this much */
+#define G_SCALE  3              /* scale G distances by this much */
+#define B_SCALE  1              /* and B by this much */
 
-static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
-#define C0_SCALE c_scales[rgb_red[cinfo->out_color_space]]
-#define C1_SCALE c_scales[rgb_green[cinfo->out_color_space]]
-#define C2_SCALE c_scales[rgb_blue[cinfo->out_color_space]]
+static const int c_scales[3] = { R_SCALE, G_SCALE, B_SCALE };
+#define C0_SCALE  c_scales[rgb_red[cinfo->out_color_space]]
+#define C1_SCALE  c_scales[rgb_green[cinfo->out_color_space]]
+#define C2_SCALE  c_scales[rgb_blue[cinfo->out_color_space]]
 
 /*
  * First we have the histogram data structure and routines for creating it.
@@ -106,7 +106,7 @@ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
  * each 2-D array has 2^6*2^5 = 2048 or 2^6*2^6 = 4096 entries.
  */
 
-#define MAXNUMCOLORS  (MAXJSAMPLE+1) /* maximum size of colormap */
+#define MAXNUMCOLORS  (MAXJSAMPLE + 1) /* maximum size of colormap */
 
 /* These will do the right thing for either R,G,B or B,G,R color order,
  * but you may not like the results for other color orders.
@@ -116,19 +116,19 @@ static const int c_scales[3]={R_SCALE, G_SCALE, B_SCALE};
 #define HIST_C2_BITS  5         /* bits of precision in B/R histogram */
 
 /* Number of elements along histogram axes. */
-#define HIST_C0_ELEMS  (1<<HIST_C0_BITS)
-#define HIST_C1_ELEMS  (1<<HIST_C1_BITS)
-#define HIST_C2_ELEMS  (1<<HIST_C2_BITS)
+#define HIST_C0_ELEMS  (1 << HIST_C0_BITS)
+#define HIST_C1_ELEMS  (1 << HIST_C1_BITS)
+#define HIST_C2_ELEMS  (1 << HIST_C2_BITS)
 
 /* These are the amounts to shift an input value to get a histogram index. */
-#define C0_SHIFT  (BITS_IN_JSAMPLE-HIST_C0_BITS)
-#define C1_SHIFT  (BITS_IN_JSAMPLE-HIST_C1_BITS)
-#define C2_SHIFT  (BITS_IN_JSAMPLE-HIST_C2_BITS)
+#define C0_SHIFT  (BITS_IN_JSAMPLE - HIST_C0_BITS)
+#define C1_SHIFT  (BITS_IN_JSAMPLE - HIST_C1_BITS)
+#define C2_SHIFT  (BITS_IN_JSAMPLE - HIST_C2_BITS)
 
 
 typedef UINT16 histcell;        /* histogram cell; prefer an unsigned type */
 
-typedef histcell *histptr; /* for pointers to histogram cells */
+typedef histcell *histptr;      /* for pointers to histogram cells */
 
 typedef histcell hist1d[HIST_C2_ELEMS]; /* typedefs for the array */
 typedef hist1d *hist2d;         /* type for the 2nd-level pointers */
@@ -200,10 +200,10 @@ typedef my_cquantizer *my_cquantize_ptr;
  */
 
 METHODDEF(void)
-prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
-                  JSAMPARRAY output_buf, int num_rows)
+prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                 JSAMPARRAY output_buf, int num_rows)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   register JSAMPROW ptr;
   register histptr histp;
   register hist3d histogram = cquantize->histogram;
@@ -215,9 +215,9 @@ prescan_quantize (j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     ptr = input_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the histogram */
-      histp = & histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
-                         [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
-                         [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+      histp = &histogram[ptr[0] >> C0_SHIFT]
+                        [ptr[1] >> C1_SHIFT]
+                        [ptr[2] >> C2_SHIFT];
       /* increment, check for overflow and undo increment if so. */
       if (++(*histp) <= 0)
         (*histp)--;
@@ -249,7 +249,7 @@ typedef box *boxptr;
 
 
 LOCAL(boxptr)
-find_biggest_color_pop (boxptr boxlist, int numboxes)
+find_biggest_color_pop(boxptr boxlist, int numboxes)
 /* Find the splittable box with the largest color population */
 /* Returns NULL if no splittable boxes remain */
 {
@@ -269,7 +269,7 @@ find_biggest_color_pop (boxptr boxlist, int numboxes)
 
 
 LOCAL(boxptr)
-find_biggest_volume (boxptr boxlist, int numboxes)
+find_biggest_volume(boxptr boxlist, int numboxes)
 /* Find the splittable box with the largest (scaled) volume */
 /* Returns NULL if no splittable boxes remain */
 {
@@ -289,16 +289,16 @@ find_biggest_volume (boxptr boxlist, int numboxes)
 
 
 LOCAL(void)
-update_box (j_decompress_ptr cinfo, boxptr boxp)
+update_box(j_decompress_ptr cinfo, boxptr boxp)
 /* Shrink the min/max bounds of a box to enclose only nonzero elements, */
 /* and recompute its volume and population */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   histptr histp;
-  int c0,c1,c2;
-  int c0min,c0max,c1min,c1max,c2min,c2max;
-  JLONG dist0,dist1,dist2;
+  int c0, c1, c2;
+  int c0min, c0max, c1min, c1max, c2min, c2max;
+  JLONG dist0, dist1, dist2;
   long ccount;
 
   c0min = boxp->c0min;  c0max = boxp->c0max;
@@ -308,69 +308,69 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
   if (c0max > c0min)
     for (c0 = c0min; c0 <= c0max; c0++)
       for (c1 = c1min; c1 <= c1max; c1++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c0min = c0min = c0;
             goto have_c0min;
           }
       }
- have_c0min:
+have_c0min:
   if (c0max > c0min)
     for (c0 = c0max; c0 >= c0min; c0--)
       for (c1 = c1min; c1 <= c1max; c1++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c0max = c0max = c0;
             goto have_c0max;
           }
       }
- have_c0max:
+have_c0max:
   if (c1max > c1min)
     for (c1 = c1min; c1 <= c1max; c1++)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c1min = c1min = c1;
             goto have_c1min;
           }
       }
- have_c1min:
+have_c1min:
   if (c1max > c1min)
     for (c1 = c1max; c1 >= c1min; c1--)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1][c2min];
+        histp = &histogram[c0][c1][c2min];
         for (c2 = c2min; c2 <= c2max; c2++)
           if (*histp++ != 0) {
             boxp->c1max = c1max = c1;
             goto have_c1max;
           }
       }
- have_c1max:
+have_c1max:
   if (c2max > c2min)
     for (c2 = c2min; c2 <= c2max; c2++)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1min][c2];
+        histp = &histogram[c0][c1min][c2];
         for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
           if (*histp != 0) {
             boxp->c2min = c2min = c2;
             goto have_c2min;
           }
       }
- have_c2min:
+have_c2min:
   if (c2max > c2min)
     for (c2 = c2max; c2 >= c2min; c2--)
       for (c0 = c0min; c0 <= c0max; c0++) {
-        histp = & histogram[c0][c1min][c2];
+        histp = &histogram[c0][c1min][c2];
         for (c1 = c1min; c1 <= c1max; c1++, histp += HIST_C2_ELEMS)
           if (*histp != 0) {
             boxp->c2max = c2max = c2;
             goto have_c2max;
           }
       }
- have_c2max:
+have_c2max:
 
   /* Update box volume.
    * We use 2-norm rather than real volume here; this biases the method
@@ -383,13 +383,13 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
   dist0 = ((c0max - c0min) << C0_SHIFT) * C0_SCALE;
   dist1 = ((c1max - c1min) << C1_SHIFT) * C1_SCALE;
   dist2 = ((c2max - c2min) << C2_SHIFT) * C2_SCALE;
-  boxp->volume = dist0*dist0 + dist1*dist1 + dist2*dist2;
+  boxp->volume = dist0 * dist0 + dist1 * dist1 + dist2 * dist2;
 
   /* Now scan remaining volume of box and compute population */
   ccount = 0;
   for (c0 = c0min; c0 <= c0max; c0++)
     for (c1 = c1min; c1 <= c1max; c1++) {
-      histp = & histogram[c0][c1][c2min];
+      histp = &histogram[c0][c1][c2min];
       for (c2 = c2min; c2 <= c2max; c2++, histp++)
         if (*histp != 0) {
           ccount++;
@@ -400,19 +400,19 @@ update_box (j_decompress_ptr cinfo, boxptr boxp)
 
 
 LOCAL(int)
-median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
-            int desired_colors)
+median_cut(j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
+           int desired_colors)
 /* Repeatedly select and split the largest box until we have enough boxes */
 {
-  int n,lb;
-  int c0,c1,c2,cmax;
-  register boxptr b1,b2;
+  int n, lb;
+  int c0, c1, c2, cmax;
+  register boxptr b1, b2;
 
   while (numboxes < desired_colors) {
     /* Select box to split.
      * Current algorithm: by population for first half, then by volume.
      */
-    if (numboxes*2 <= desired_colors) {
+    if (numboxes * 2 <= desired_colors) {
       b1 = find_biggest_color_pop(boxlist, numboxes);
     } else {
       b1 = find_biggest_volume(boxlist, numboxes);
@@ -421,8 +421,8 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
       break;
     b2 = &boxlist[numboxes];    /* where new box will go */
     /* Copy the color bounds to the new box. */
-    b2->c0max = b1->c0max; b2->c1max = b1->c1max; b2->c2max = b1->c2max;
-    b2->c0min = b1->c0min; b2->c1min = b1->c1min; b2->c2min = b1->c2min;
+    b2->c0max = b1->c0max;  b2->c1max = b1->c1max;  b2->c2max = b1->c2max;
+    b2->c0min = b1->c0min;  b2->c1min = b1->c1min;  b2->c2min = b1->c2min;
     /* Choose which axis to split the box on.
      * Current algorithm: longest scaled axis.
      * See notes in update_box about scaling distances.
@@ -434,13 +434,12 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
      * This code does the right thing for R,G,B or B,G,R color orders only.
      */
     if (rgb_red[cinfo->out_color_space] == 0) {
-      cmax = c1; n = 1;
-      if (c0 > cmax) { cmax = c0; n = 0; }
+      cmax = c1;  n = 1;
+      if (c0 > cmax) { cmax = c0;  n = 0; }
       if (c2 > cmax) { n = 2; }
-    }
-    else {
-      cmax = c1; n = 1;
-      if (c2 > cmax) { cmax = c2; n = 2; }
+    } else {
+      cmax = c1;  n = 1;
+      if (c2 > cmax) { cmax = c2;  n = 2; }
       if (c0 > cmax) { n = 0; }
     }
     /* Choose split point along selected axis, and update box bounds.
@@ -453,17 +452,17 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
     case 0:
       lb = (b1->c0max + b1->c0min) / 2;
       b1->c0max = lb;
-      b2->c0min = lb+1;
+      b2->c0min = lb + 1;
       break;
     case 1:
       lb = (b1->c1max + b1->c1min) / 2;
       b1->c1max = lb;
-      b2->c1min = lb+1;
+      b2->c1min = lb + 1;
       break;
     case 2:
       lb = (b1->c2max + b1->c2min) / 2;
       b1->c2max = lb;
-      b2->c2min = lb+1;
+      b2->c2min = lb + 1;
       break;
     }
     /* Update stats for boxes */
@@ -476,16 +475,16 @@ median_cut (j_decompress_ptr cinfo, boxptr boxlist, int numboxes,
 
 
 LOCAL(void)
-compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor)
+compute_color(j_decompress_ptr cinfo, boxptr boxp, int icolor)
 /* Compute representative color for a box, put it in colormap[icolor] */
 {
   /* Current algorithm: mean weighted by pixels (not colors) */
   /* Note it is important to get the rounding correct! */
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   histptr histp;
-  int c0,c1,c2;
-  int c0min,c0max,c1min,c1max,c2min,c2max;
+  int c0, c1, c2;
+  int c0min, c0max, c1min, c1max, c2min, c2max;
   long count;
   long total = 0;
   long c0total = 0;
@@ -498,25 +497,25 @@ compute_color (j_decompress_ptr cinfo, boxptr boxp, int icolor)
 
   for (c0 = c0min; c0 <= c0max; c0++)
     for (c1 = c1min; c1 <= c1max; c1++) {
-      histp = & histogram[c0][c1][c2min];
+      histp = &histogram[c0][c1][c2min];
       for (c2 = c2min; c2 <= c2max; c2++) {
         if ((count = *histp++) != 0) {
           total += count;
-          c0total += ((c0 << C0_SHIFT) + ((1<<C0_SHIFT)>>1)) * count;
-          c1total += ((c1 << C1_SHIFT) + ((1<<C1_SHIFT)>>1)) * count;
-          c2total += ((c2 << C2_SHIFT) + ((1<<C2_SHIFT)>>1)) * count;
+          c0total += ((c0 << C0_SHIFT) + ((1 << C0_SHIFT) >> 1)) * count;
+          c1total += ((c1 << C1_SHIFT) + ((1 << C1_SHIFT) >> 1)) * count;
+          c2total += ((c2 << C2_SHIFT) + ((1 << C2_SHIFT) >> 1)) * count;
         }
       }
     }
 
-  cinfo->colormap[0][icolor] = (JSAMPLE) ((c0total + (total>>1)) / total);
-  cinfo->colormap[1][icolor] = (JSAMPLE) ((c1total + (total>>1)) / total);
-  cinfo->colormap[2][icolor] = (JSAMPLE) ((c2total + (total>>1)) / total);
+  cinfo->colormap[0][icolor] = (JSAMPLE)((c0total + (total >> 1)) / total);
+  cinfo->colormap[1][icolor] = (JSAMPLE)((c1total + (total >> 1)) / total);
+  cinfo->colormap[2][icolor] = (JSAMPLE)((c2total + (total >> 1)) / total);
 }
 
 
 LOCAL(void)
-select_colors (j_decompress_ptr cinfo, int desired_colors)
+select_colors(j_decompress_ptr cinfo, int desired_colors)
 /* Master routine for color selection */
 {
   boxptr boxlist;
@@ -524,8 +523,8 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
   int i;
 
   /* Allocate workspace for box list */
-  boxlist = (boxptr) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
+  boxlist = (boxptr)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, desired_colors * sizeof(box));
   /* Initialize one box containing whole space */
   numboxes = 1;
   boxlist[0].c0min = 0;
@@ -535,12 +534,12 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
   boxlist[0].c2min = 0;
   boxlist[0].c2max = MAXJSAMPLE >> C2_SHIFT;
   /* Shrink it to actually-used volume and set its statistics */
-  update_box(cinfo, & boxlist[0]);
+  update_box(cinfo, &boxlist[0]);
   /* Perform median-cut to produce final box list */
   numboxes = median_cut(cinfo, boxlist, numboxes, desired_colors);
   /* Compute the representative color for each box, fill colormap */
   for (i = 0; i < numboxes; i++)
-    compute_color(cinfo, & boxlist[i], i);
+    compute_color(cinfo, &boxlist[i], i);
   cinfo->actual_number_of_colors = numboxes;
   TRACEMS1(cinfo, 1, JTRC_QUANT_SELECTED, numboxes);
 }
@@ -601,13 +600,13 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
 
 
 /* log2(histogram cells in update box) for each axis; this can be adjusted */
-#define BOX_C0_LOG  (HIST_C0_BITS-3)
-#define BOX_C1_LOG  (HIST_C1_BITS-3)
-#define BOX_C2_LOG  (HIST_C2_BITS-3)
+#define BOX_C0_LOG  (HIST_C0_BITS - 3)
+#define BOX_C1_LOG  (HIST_C1_BITS - 3)
+#define BOX_C2_LOG  (HIST_C2_BITS - 3)
 
-#define BOX_C0_ELEMS  (1<<BOX_C0_LOG) /* # of hist cells in update box */
-#define BOX_C1_ELEMS  (1<<BOX_C1_LOG)
-#define BOX_C2_ELEMS  (1<<BOX_C2_LOG)
+#define BOX_C0_ELEMS  (1 << BOX_C0_LOG) /* # of hist cells in update box */
+#define BOX_C1_ELEMS  (1 << BOX_C1_LOG)
+#define BOX_C2_ELEMS  (1 << BOX_C2_LOG)
 
 #define BOX_C0_SHIFT  (C0_SHIFT + BOX_C0_LOG)
 #define BOX_C1_SHIFT  (C1_SHIFT + BOX_C1_LOG)
@@ -623,8 +622,8 @@ select_colors (j_decompress_ptr cinfo, int desired_colors)
  */
 
 LOCAL(int)
-find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-                    JSAMPLE colorlist[])
+find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+                   JSAMPLE colorlist[])
 /* Locate the colormap entries close enough to an update box to be candidates
  * for the nearest entry to some cell(s) in the update box.  The update box
  * is specified by the center coordinates of its first cell.  The number of
@@ -666,70 +665,70 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 
   for (i = 0; i < numcolors; i++) {
     /* We compute the squared-c0-distance term, then add in the other two. */
-    x = GETJSAMPLE(cinfo->colormap[0][i]);
+    x = cinfo->colormap[0][i];
     if (x < minc0) {
       tdist = (x - minc0) * C0_SCALE;
-      min_dist = tdist*tdist;
+      min_dist = tdist * tdist;
       tdist = (x - maxc0) * C0_SCALE;
-      max_dist = tdist*tdist;
+      max_dist = tdist * tdist;
     } else if (x > maxc0) {
       tdist = (x - maxc0) * C0_SCALE;
-      min_dist = tdist*tdist;
+      min_dist = tdist * tdist;
       tdist = (x - minc0) * C0_SCALE;
-      max_dist = tdist*tdist;
+      max_dist = tdist * tdist;
     } else {
       /* within cell range so no contribution to min_dist */
       min_dist = 0;
       if (x <= centerc0) {
         tdist = (x - maxc0) * C0_SCALE;
-        max_dist = tdist*tdist;
+        max_dist = tdist * tdist;
       } else {
         tdist = (x - minc0) * C0_SCALE;
-        max_dist = tdist*tdist;
+        max_dist = tdist * tdist;
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[1][i]);
+    x = cinfo->colormap[1][i];
     if (x < minc1) {
       tdist = (x - minc1) * C1_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - maxc1) * C1_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else if (x > maxc1) {
       tdist = (x - maxc1) * C1_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - minc1) * C1_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else {
       /* within cell range so no contribution to min_dist */
       if (x <= centerc1) {
         tdist = (x - maxc1) * C1_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       } else {
         tdist = (x - minc1) * C1_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[2][i]);
+    x = cinfo->colormap[2][i];
     if (x < minc2) {
       tdist = (x - minc2) * C2_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - maxc2) * C2_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else if (x > maxc2) {
       tdist = (x - maxc2) * C2_SCALE;
-      min_dist += tdist*tdist;
+      min_dist += tdist * tdist;
       tdist = (x - minc2) * C2_SCALE;
-      max_dist += tdist*tdist;
+      max_dist += tdist * tdist;
     } else {
       /* within cell range so no contribution to min_dist */
       if (x <= centerc2) {
         tdist = (x - maxc2) * C2_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       } else {
         tdist = (x - minc2) * C2_SCALE;
-        max_dist += tdist*tdist;
+        max_dist += tdist * tdist;
       }
     }
 
@@ -745,15 +744,15 @@ find_nearby_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
   ncolors = 0;
   for (i = 0; i < numcolors; i++) {
     if (mindist[i] <= minmaxdist)
-      colorlist[ncolors++] = (JSAMPLE) i;
+      colorlist[ncolors++] = (JSAMPLE)i;
   }
   return ncolors;
 }
 
 
 LOCAL(void)
-find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
-                  int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
+find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
+                 int numcolors, JSAMPLE colorlist[], JSAMPLE bestcolor[])
 /* Find the closest colormap entry for each cell in the update box,
  * given the list of candidate colors prepared by find_nearby_colors.
  * Return the indexes of the closest entries in the bestcolor[] array.
@@ -775,7 +774,7 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 
   /* Initialize best-distance for each cell of the update box */
   bptr = bestdist;
-  for (i = BOX_C0_ELEMS*BOX_C1_ELEMS*BOX_C2_ELEMS-1; i >= 0; i--)
+  for (i = BOX_C0_ELEMS * BOX_C1_ELEMS * BOX_C2_ELEMS - 1; i >= 0; i--)
     *bptr++ = 0x7FFFFFFFL;
 
   /* For each color selected by find_nearby_colors,
@@ -789,14 +788,14 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 #define STEP_C2  ((1 << C2_SHIFT) * C2_SCALE)
 
   for (i = 0; i < numcolors; i++) {
-    icolor = GETJSAMPLE(colorlist[i]);
+    icolor = colorlist[i];
     /* Compute (square of) distance from minc0/c1/c2 to this color */
-    inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
-    dist0 = inc0*inc0;
-    inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
-    dist0 += inc1*inc1;
-    inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
-    dist0 += inc2*inc2;
+    inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
+    dist0 = inc0 * inc0;
+    inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
+    dist0 += inc1 * inc1;
+    inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
+    dist0 += inc2 * inc2;
     /* Form the initial difference increments */
     inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
     inc1 = inc1 * (2 * STEP_C1) + STEP_C1 * STEP_C1;
@@ -805,16 +804,16 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
     bptr = bestdist;
     cptr = bestcolor;
     xx0 = inc0;
-    for (ic0 = BOX_C0_ELEMS-1; ic0 >= 0; ic0--) {
+    for (ic0 = BOX_C0_ELEMS - 1; ic0 >= 0; ic0--) {
       dist1 = dist0;
       xx1 = inc1;
-      for (ic1 = BOX_C1_ELEMS-1; ic1 >= 0; ic1--) {
+      for (ic1 = BOX_C1_ELEMS - 1; ic1 >= 0; ic1--) {
         dist2 = dist1;
         xx2 = inc2;
-        for (ic2 = BOX_C2_ELEMS-1; ic2 >= 0; ic2--) {
+        for (ic2 = BOX_C2_ELEMS - 1; ic2 >= 0; ic2--) {
           if (dist2 < *bptr) {
             *bptr = dist2;
-            *cptr = (JSAMPLE) icolor;
+            *cptr = (JSAMPLE)icolor;
           }
           dist2 += xx2;
           xx2 += 2 * STEP_C2 * STEP_C2;
@@ -832,12 +831,12 @@ find_best_colors (j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 
 
 LOCAL(void)
-fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
+fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
 /* Fill the inverse-colormap entries in the update box that contains */
 /* histogram cell c0/c1/c2.  (Only that one cell MUST be filled, but */
 /* we can fill as many others as we wish.) */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   int minc0, minc1, minc2;      /* lower left corner of update box */
   int ic0, ic1, ic2;
@@ -878,9 +877,9 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
   cptr = bestcolor;
   for (ic0 = 0; ic0 < BOX_C0_ELEMS; ic0++) {
     for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
-      cachep = & histogram[c0+ic0][c1+ic1][c2];
+      cachep = &histogram[c0 + ic0][c1 + ic1][c2];
       for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
-        *cachep++ = (histcell) (GETJSAMPLE(*cptr++) + 1);
+        *cachep++ = (histcell)((*cptr++) + 1);
       }
     }
   }
@@ -892,11 +891,11 @@ fill_inverse_cmap (j_decompress_ptr cinfo, int c0, int c1, int c2)
  */
 
 METHODDEF(void)
-pass2_no_dither (j_decompress_ptr cinfo,
-                 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
 /* This version performs no dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   register JSAMPROW inptr, outptr;
   register histptr cachep;
@@ -910,27 +909,27 @@ pass2_no_dither (j_decompress_ptr cinfo,
     outptr = output_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the cache */
-      c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
-      c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
-      c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
-      cachep = & histogram[c0][c1][c2];
+      c0 = (*inptr++) >> C0_SHIFT;
+      c1 = (*inptr++) >> C1_SHIFT;
+      c2 = (*inptr++) >> C2_SHIFT;
+      cachep = &histogram[c0][c1][c2];
       /* If we have not seen this color before, find nearest colormap entry */
       /* and update the cache */
       if (*cachep == 0)
-        fill_inverse_cmap(cinfo, c0,c1,c2);
+        fill_inverse_cmap(cinfo, c0, c1, c2);
       /* Now emit the colormap index for this cell */
-      *outptr++ = (JSAMPLE) (*cachep - 1);
+      *outptr++ = (JSAMPLE)(*cachep - 1);
     }
   }
 }
 
 
 METHODDEF(void)
-pass2_fs_dither (j_decompress_ptr cinfo,
-                 JSAMPARRAY input_buf, JSAMPARRAY output_buf, int num_rows)
+pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
+                JSAMPARRAY output_buf, int num_rows)
 /* This version performs Floyd-Steinberg dithering */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   register LOCFSERROR cur0, cur1, cur2; /* current error or pixel value */
   LOCFSERROR belowerr0, belowerr1, belowerr2; /* error for pixel below cur */
@@ -956,11 +955,11 @@ pass2_fs_dither (j_decompress_ptr cinfo,
     outptr = output_buf[row];
     if (cquantize->on_odd_row) {
       /* work right to left in this row */
-      inptr += (width-1) * 3;   /* so point to rightmost pixel */
-      outptr += width-1;
+      inptr += (width - 1) * 3; /* so point to rightmost pixel */
+      outptr += width - 1;
       dir = -1;
       dir3 = -3;
-      errorptr = cquantize->fserrors + (width+1)*3; /* => entry after last column */
+      errorptr = cquantize->fserrors + (width + 1) * 3; /* => entry after last column */
       cquantize->on_odd_row = FALSE; /* flip for next time */
     } else {
       /* work left to right in this row */
@@ -984,9 +983,9 @@ pass2_fs_dither (j_decompress_ptr cinfo,
        * for either sign of the error value.
        * Note: errorptr points to *previous* column's array entry.
        */
-      cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3+0] + 8, 4);
-      cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3+1] + 8, 4);
-      cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3+2] + 8, 4);
+      cur0 = RIGHT_SHIFT(cur0 + errorptr[dir3 + 0] + 8, 4);
+      cur1 = RIGHT_SHIFT(cur1 + errorptr[dir3 + 1] + 8, 4);
+      cur2 = RIGHT_SHIFT(cur2 + errorptr[dir3 + 2] + 8, 4);
       /* Limit the error using transfer function set by init_error_limit.
        * See comments with init_error_limit for rationale.
        */
@@ -997,44 +996,48 @@ pass2_fs_dither (j_decompress_ptr cinfo,
        * The maximum error is +- MAXJSAMPLE (or less with error limiting);
        * this sets the required size of the range_limit array.
        */
-      cur0 += GETJSAMPLE(inptr[0]);
-      cur1 += GETJSAMPLE(inptr[1]);
-      cur2 += GETJSAMPLE(inptr[2]);
-      cur0 = GETJSAMPLE(range_limit[cur0]);
-      cur1 = GETJSAMPLE(range_limit[cur1]);
-      cur2 = GETJSAMPLE(range_limit[cur2]);
+      cur0 += inptr[0];
+      cur1 += inptr[1];
+      cur2 += inptr[2];
+      cur0 = range_limit[cur0];
+      cur1 = range_limit[cur1];
+      cur2 = range_limit[cur2];
       /* Index into the cache with adjusted pixel value */
-      cachep = & histogram[cur0>>C0_SHIFT][cur1>>C1_SHIFT][cur2>>C2_SHIFT];
+      cachep =
+        &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
       /* If we have not seen this color before, find nearest colormap */
       /* entry and update the cache */
       if (*cachep == 0)
-        fill_inverse_cmap(cinfo, cur0>>C0_SHIFT,cur1>>C1_SHIFT,cur2>>C2_SHIFT);
+        fill_inverse_cmap(cinfo, cur0 >> C0_SHIFT, cur1 >> C1_SHIFT,
+                          cur2 >> C2_SHIFT);
       /* Now emit the colormap index for this cell */
-      { register int pixcode = *cachep - 1;
-        *outptr = (JSAMPLE) pixcode;
+      {
+        register int pixcode = *cachep - 1;
+        *outptr = (JSAMPLE)pixcode;
         /* Compute representation error for this pixel */
-        cur0 -= GETJSAMPLE(colormap0[pixcode]);
-        cur1 -= GETJSAMPLE(colormap1[pixcode]);
-        cur2 -= GETJSAMPLE(colormap2[pixcode]);
+        cur0 -= colormap0[pixcode];
+        cur1 -= colormap1[pixcode];
+        cur2 -= colormap2[pixcode];
       }
       /* Compute error fractions to be propagated to adjacent pixels.
        * Add these into the running sums, and simultaneously shift the
        * next-line error sums left by 1 column.
        */
-      { register LOCFSERROR bnexterr;
+      {
+        register LOCFSERROR bnexterr;
 
         bnexterr = cur0;        /* Process component 0 */
-        errorptr[0] = (FSERROR) (bpreverr0 + cur0 * 3);
+        errorptr[0] = (FSERROR)(bpreverr0 + cur0 * 3);
         bpreverr0 = belowerr0 + cur0 * 5;
         belowerr0 = bnexterr;
         cur0 *= 7;
         bnexterr = cur1;        /* Process component 1 */
-        errorptr[1] = (FSERROR) (bpreverr1 + cur1 * 3);
+        errorptr[1] = (FSERROR)(bpreverr1 + cur1 * 3);
         bpreverr1 = belowerr1 + cur1 * 5;
         belowerr1 = bnexterr;
         cur1 *= 7;
         bnexterr = cur2;        /* Process component 2 */
-        errorptr[2] = (FSERROR) (bpreverr2 + cur2 * 3);
+        errorptr[2] = (FSERROR)(bpreverr2 + cur2 * 3);
         bpreverr2 = belowerr2 + cur2 * 5;
         belowerr2 = bnexterr;
         cur2 *= 7;
@@ -1051,9 +1054,9 @@ pass2_fs_dither (j_decompress_ptr cinfo,
      * final fserrors[] entry.  Note we need not unload belowerrN because
      * it is for the dummy column before or after the actual array.
      */
-    errorptr[0] = (FSERROR) bpreverr0; /* unload prev errs into array */
-    errorptr[1] = (FSERROR) bpreverr1;
-    errorptr[2] = (FSERROR) bpreverr2;
+    errorptr[0] = (FSERROR)bpreverr0; /* unload prev errs into array */
+    errorptr[1] = (FSERROR)bpreverr1;
+    errorptr[2] = (FSERROR)bpreverr2;
   }
 }
 
@@ -1076,31 +1079,31 @@ pass2_fs_dither (j_decompress_ptr cinfo,
  */
 
 LOCAL(void)
-init_error_limit (j_decompress_ptr cinfo)
+init_error_limit(j_decompress_ptr cinfo)
 /* Allocate and fill in the error_limiter table */
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   int *table;
   int in, out;
 
-  table = (int *) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, (MAXJSAMPLE*2+1) * sizeof(int));
+  table = (int *)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (MAXJSAMPLE * 2 + 1) * sizeof(int));
   table += MAXJSAMPLE;          /* so can index -MAXJSAMPLE .. +MAXJSAMPLE */
   cquantize->error_limiter = table;
 
-#define STEPSIZE ((MAXJSAMPLE+1)/16)
+#define STEPSIZE  ((MAXJSAMPLE + 1) / 16)
   /* Map errors 1:1 up to +- MAXJSAMPLE/16 */
   out = 0;
   for (in = 0; in < STEPSIZE; in++, out++) {
-    table[in] = out; table[-in] = -out;
+    table[in] = out;  table[-in] = -out;
   }
   /* Map errors 1:2 up to +- 3*MAXJSAMPLE/16 */
-  for (; in < STEPSIZE*3; in++, out += (in&1) ? 0 : 1) {
-    table[in] = out; table[-in] = -out;
+  for (; in < STEPSIZE * 3; in++, out += (in & 1) ? 0 : 1) {
+    table[in] = out;  table[-in] = -out;
   }
   /* Clamp the rest to final out value (which is (MAXJSAMPLE+1)/8) */
   for (; in <= MAXJSAMPLE; in++) {
-    table[in] = out; table[-in] = -out;
+    table[in] = out;  table[-in] = -out;
   }
 #undef STEPSIZE
 }
@@ -1111,9 +1114,9 @@ init_error_limit (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-finish_pass1 (j_decompress_ptr cinfo)
+finish_pass1(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
 
   /* Select the representative colors and fill in cinfo->colormap */
   cinfo->colormap = cquantize->sv_colormap;
@@ -1124,7 +1127,7 @@ finish_pass1 (j_decompress_ptr cinfo)
 
 
 METHODDEF(void)
-finish_pass2 (j_decompress_ptr cinfo)
+finish_pass2(j_decompress_ptr cinfo)
 {
   /* no work */
 }
@@ -1135,14 +1138,14 @@ finish_pass2 (j_decompress_ptr cinfo)
  */
 
 METHODDEF(void)
-start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
+start_pass_2_quant(j_decompress_ptr cinfo, boolean is_pre_scan)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
   hist3d histogram = cquantize->histogram;
   int i;
 
   /* Only F-S dithering or no dithering is supported. */
-  /* If user asks for ordered dither, give him F-S. */
+  /* If user asks for ordered dither, give them F-S. */
   if (cinfo->dither_mode != JDITHER_NONE)
     cinfo->dither_mode = JDITHER_FS;
 
@@ -1167,14 +1170,14 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
       ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
 
     if (cinfo->dither_mode == JDITHER_FS) {
-      size_t arraysize = (size_t) ((cinfo->output_width + 2) *
-                                   (3 * sizeof(FSERROR)));
+      size_t arraysize =
+        (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR)));
       /* Allocate Floyd-Steinberg workspace if we didn't already. */
       if (cquantize->fserrors == NULL)
-        cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
-          ((j_common_ptr) cinfo, JPOOL_IMAGE, arraysize);
+        cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+          ((j_common_ptr)cinfo, JPOOL_IMAGE, arraysize);
       /* Initialize the propagated errors to zero. */
-      jzero_far((void *) cquantize->fserrors, arraysize);
+      jzero_far((void *)cquantize->fserrors, arraysize);
       /* Make the error-limit table if we didn't already. */
       if (cquantize->error_limiter == NULL)
         init_error_limit(cinfo);
@@ -1185,8 +1188,8 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
   /* Zero the histogram or inverse color map, if necessary */
   if (cquantize->needs_zeroed) {
     for (i = 0; i < HIST_C0_ELEMS; i++) {
-      jzero_far((void *) histogram[i],
-                HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+      jzero_far((void *)histogram[i],
+                HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
     }
     cquantize->needs_zeroed = FALSE;
   }
@@ -1198,9 +1201,9 @@ start_pass_2_quant (j_decompress_ptr cinfo, boolean is_pre_scan)
  */
 
 METHODDEF(void)
-new_color_map_2_quant (j_decompress_ptr cinfo)
+new_color_map_2_quant(j_decompress_ptr cinfo)
 {
-  my_cquantize_ptr cquantize = (my_cquantize_ptr) cinfo->cquantize;
+  my_cquantize_ptr cquantize = (my_cquantize_ptr)cinfo->cquantize;
 
   /* Reset the inverse color map */
   cquantize->needs_zeroed = TRUE;
@@ -1212,15 +1215,15 @@ new_color_map_2_quant (j_decompress_ptr cinfo)
  */
 
 GLOBAL(void)
-jinit_2pass_quantizer (j_decompress_ptr cinfo)
+jinit_2pass_quantizer(j_decompress_ptr cinfo)
 {
   my_cquantize_ptr cquantize;
   int i;
 
   cquantize = (my_cquantize_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                 sizeof(my_cquantizer));
-  cinfo->cquantize = (struct jpeg_color_quantizer *) cquantize;
+  cinfo->cquantize = (struct jpeg_color_quantizer *)cquantize;
   cquantize->pub.start_pass = start_pass_2_quant;
   cquantize->pub.new_color_map = new_color_map_2_quant;
   cquantize->fserrors = NULL;   /* flag optional arrays not allocated */
@@ -1231,12 +1234,12 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
     ERREXIT(cinfo, JERR_NOTIMPL);
 
   /* Allocate the histogram/inverse colormap storage */
-  cquantize->histogram = (hist3d) (*cinfo->mem->alloc_small)
-    ((j_common_ptr) cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
+  cquantize->histogram = (hist3d)(*cinfo->mem->alloc_small)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, HIST_C0_ELEMS * sizeof(hist2d));
   for (i = 0; i < HIST_C0_ELEMS; i++) {
-    cquantize->histogram[i] = (hist2d) (*cinfo->mem->alloc_large)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       HIST_C1_ELEMS*HIST_C2_ELEMS * sizeof(histcell));
+    cquantize->histogram[i] = (hist2d)(*cinfo->mem->alloc_large)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       HIST_C1_ELEMS * HIST_C2_ELEMS * sizeof(histcell));
   }
   cquantize->needs_zeroed = TRUE; /* histogram is garbage now */
 
@@ -1254,13 +1257,13 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
     if (desired > MAXNUMCOLORS)
       ERREXIT1(cinfo, JERR_QUANT_MANY_COLORS, MAXNUMCOLORS);
     cquantize->sv_colormap = (*cinfo->mem->alloc_sarray)
-      ((j_common_ptr) cinfo,JPOOL_IMAGE, (JDIMENSION) desired, (JDIMENSION) 3);
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)desired, (JDIMENSION)3);
     cquantize->desired = desired;
   } else
     cquantize->sv_colormap = NULL;
 
   /* Only F-S dithering or no dithering is supported. */
-  /* If user asks for ordered dither, give him F-S. */
+  /* If user asks for ordered dither, give them F-S. */
   if (cinfo->dither_mode != JDITHER_NONE)
     cinfo->dither_mode = JDITHER_FS;
 
@@ -1271,9 +1274,9 @@ jinit_2pass_quantizer (j_decompress_ptr cinfo)
    * dither_mode changes.
    */
   if (cinfo->dither_mode == JDITHER_FS) {
-    cquantize->fserrors = (FSERRPTR) (*cinfo->mem->alloc_large)
-      ((j_common_ptr) cinfo, JPOOL_IMAGE,
-       (size_t) ((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
+    cquantize->fserrors = (FSERRPTR)(*cinfo->mem->alloc_large)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE,
+       (size_t)((cinfo->output_width + 2) * (3 * sizeof(FSERROR))));
     /* Might as well create the error-limiting table too. */
     init_error_limit(cinfo);
   }
diff --git a/media/libjpeg/jsimd.h b/media/libjpeg/jsimd.h
index 3aa0779b8a..6c203655ef 100644
--- a/media/libjpeg/jsimd.h
+++ b/media/libjpeg/jsimd.h
@@ -3,7 +3,8 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,81 +14,110 @@
 
 #include "jchuff.h"             /* Declarations shared with jcphuff.c */
 
-EXTERN(int) jsimd_can_rgb_ycc (void);
-EXTERN(int) jsimd_can_rgb_gray (void);
-EXTERN(int) jsimd_can_ycc_rgb (void);
-EXTERN(int) jsimd_can_ycc_rgb565 (void);
-EXTERN(int) jsimd_c_can_null_convert (void);
+EXTERN(int) jsimd_can_rgb_ycc(void);
+EXTERN(int) jsimd_can_rgb_gray(void);
+EXTERN(int) jsimd_can_ycc_rgb(void);
+EXTERN(int) jsimd_can_ycc_rgb565(void);
+EXTERN(int) jsimd_c_can_null_convert(void);
 
-EXTERN(void) jsimd_rgb_ycc_convert
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_rgb_gray_convert
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_ycc_rgb_convert
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_rgb565_convert
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_c_null_convert
-        (j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                                   JSAMPIMAGE output_buf,
+                                   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_ycc_rgb_convert(j_decompress_ptr cinfo,
+                                   JSAMPIMAGE input_buf, JDIMENSION input_row,
+                                   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo,
+                                      JSAMPIMAGE input_buf,
+                                      JDIMENSION input_row,
+                                      JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                                  JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                  int num_rows);
 
-EXTERN(int) jsimd_can_h2v2_downsample (void);
-EXTERN(int) jsimd_can_h2v1_downsample (void);
+EXTERN(int) jsimd_can_h2v2_downsample(void);
+EXTERN(int) jsimd_can_h2v1_downsample(void);
 
-EXTERN(void) jsimd_h2v2_downsample
-        (j_compress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample(j_compress_ptr cinfo,
+                                   jpeg_component_info *compptr,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data);
 
-EXTERN(int) jsimd_can_h2v2_smooth_downsample (void);
+EXTERN(int) jsimd_can_h2v2_smooth_downsample(void);
 
-EXTERN(void) jsimd_h2v2_smooth_downsample
-        (j_compress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                                          jpeg_component_info *compptr,
+                                          JSAMPARRAY input_data,
+                                          JSAMPARRAY output_data);
 
-EXTERN(void) jsimd_h2v1_downsample
-        (j_compress_ptr cinfo, jpeg_component_info *compptr,
-        JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v1_downsample(j_compress_ptr cinfo,
+                                   jpeg_component_info *compptr,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data);
 
-EXTERN(int) jsimd_can_h2v2_upsample (void);
-EXTERN(int) jsimd_can_h2v1_upsample (void);
-EXTERN(int) jsimd_can_int_upsample (void);
+EXTERN(int) jsimd_can_h2v2_upsample(void);
+EXTERN(int) jsimd_can_h2v1_upsample(void);
+EXTERN(int) jsimd_can_int_upsample(void);
 
-EXTERN(void) jsimd_h2v2_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_int_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample(j_decompress_ptr cinfo,
+                                 jpeg_component_info *compptr,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_upsample(j_decompress_ptr cinfo,
+                                 jpeg_component_info *compptr,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
+                                jpeg_component_info *compptr,
+                                JSAMPARRAY input_data,
+                                JSAMPARRAY *output_data_ptr);
 
-EXTERN(int) jsimd_can_h2v2_fancy_upsample (void);
-EXTERN(int) jsimd_can_h2v1_fancy_upsample (void);
+EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
+EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
 
-EXTERN(void) jsimd_h2v2_fancy_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v1_fancy_upsample
-        (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
 
-EXTERN(int) jsimd_can_h2v2_merged_upsample (void);
-EXTERN(int) jsimd_can_h2v1_merged_upsample (void);
+EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
+EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
 
-EXTERN(void) jsimd_h2v2_merged_upsample
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
-EXTERN(void) jsimd_h2v1_merged_upsample
-        (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf);
 
-EXTERN(int) jsimd_can_huff_encode_one_block (void);
+EXTERN(int) jsimd_can_huff_encode_one_block(void);
 
-EXTERN(JOCTET*) jsimd_huff_encode_one_block
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block(void *state, JOCTET *buffer,
+                                             JCOEFPTR block, int last_dc_val,
+                                             c_derived_tbl *dctbl,
+                                             c_derived_tbl *actbl);
+
+EXTERN(int) jsimd_can_encode_mcu_AC_first_prepare(void);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_can_encode_mcu_AC_refine_prepare(void);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/media/libjpeg/jsimd_none.c b/media/libjpeg/jsimd_none.c
index f29030cfa7..5b38a9fb5c 100644
--- a/media/libjpeg/jsimd_none.c
+++ b/media/libjpeg/jsimd_none.c
@@ -3,7 +3,8 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -20,385 +21,411 @@
 #include "jsimddct.h"
 
 GLOBAL(int)
-jsimd_can_rgb_ycc (void)
+jsimd_can_rgb_ycc(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_rgb_gray (void)
+jsimd_can_rgb_gray(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_ycc_rgb (void)
+jsimd_can_ycc_rgb(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
+jsimd_can_ycc_rgb565(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_c_can_null_convert (void)
+jsimd_c_can_null_convert(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
 {
 }
 
 GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
-                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                      JDIMENSION output_row, int num_rows)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
 {
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
+jsimd_can_h2v2_downsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
+jsimd_can_h2v1_downsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
+jsimd_can_h2v2_smooth_downsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
-                              jpeg_component_info *compptr,
-                              JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
+jsimd_can_h2v2_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
+jsimd_can_h2v1_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_int_upsample (void)
+jsimd_can_int_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                      JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
+jsimd_can_h2v2_fancy_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
+jsimd_can_h2v1_fancy_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
+jsimd_can_h1v2_fancy_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
 }
 
 GLOBAL(int)
-jsimd_can_convsamp (void)
+jsimd_can_h2v2_merged_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_convsamp_float (void)
+jsimd_can_h2v1_merged_upsample(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
 }
 
 GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
 }
 
 GLOBAL(int)
-jsimd_can_fdct_islow (void)
+jsimd_can_convsamp(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_fdct_ifast (void)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
+jsimd_can_fdct_ifast(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_quantize_float (void)
+jsimd_can_fdct_float(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
+jsimd_fdct_islow(DCTELEM *data)
 {
 }
 
 GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
 {
 }
 
 GLOBAL(int)
-jsimd_can_idct_2x2 (void)
+jsimd_can_quantize(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_4x4 (void)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_6x6 (void)
+jsimd_can_idct_4x4(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_12x12 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
+jsimd_can_idct_6x6(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_ifast (void)
+jsimd_can_idct_12x12(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
 {
   return 0;
 }
 
 GLOBAL(int)
-jsimd_can_idct_float (void)
+jsimd_can_idct_ifast(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
 {
   return 0;
 }
 
 GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
 {
 }
 
 GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
+jsimd_can_huff_encode_one_block(void)
 {
   return 0;
 }
 
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
 {
   return NULL;
 }
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/media/libjpeg/jsimddct.h b/media/libjpeg/jsimddct.h
index b19ab48d40..55ee8cf67f 100644
--- a/media/libjpeg/jsimddct.h
+++ b/media/libjpeg/jsimddct.h
@@ -9,66 +9,62 @@
  *
  */
 
-EXTERN(int) jsimd_can_convsamp (void);
-EXTERN(int) jsimd_can_convsamp_float (void);
+EXTERN(int) jsimd_can_convsamp(void);
+EXTERN(int) jsimd_can_convsamp_float(void);
 
-EXTERN(void) jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                             DCTELEM *workspace);
-EXTERN(void) jsimd_convsamp_float (JSAMPARRAY sample_data,
-                                   JDIMENSION start_col,
-                                   FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+                            DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                                  FAST_FLOAT *workspace);
 
-EXTERN(int) jsimd_can_fdct_islow (void);
-EXTERN(int) jsimd_can_fdct_ifast (void);
-EXTERN(int) jsimd_can_fdct_float (void);
+EXTERN(int) jsimd_can_fdct_islow(void);
+EXTERN(int) jsimd_can_fdct_ifast(void);
+EXTERN(int) jsimd_can_fdct_float(void);
 
-EXTERN(void) jsimd_fdct_islow (DCTELEM *data);
-EXTERN(void) jsimd_fdct_ifast (DCTELEM *data);
-EXTERN(void) jsimd_fdct_float (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_islow(DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast(DCTELEM *data);
+EXTERN(void) jsimd_fdct_float(FAST_FLOAT *data);
 
-EXTERN(int) jsimd_can_quantize (void);
-EXTERN(int) jsimd_can_quantize_float (void);
+EXTERN(int) jsimd_can_quantize(void);
+EXTERN(int) jsimd_can_quantize_float(void);
 
-EXTERN(void) jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                             DCTELEM *workspace);
-EXTERN(void) jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                                   FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors,
+                            DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                                  FAST_FLOAT *workspace);
 
-EXTERN(int) jsimd_can_idct_2x2 (void);
-EXTERN(int) jsimd_can_idct_4x4 (void);
-EXTERN(int) jsimd_can_idct_6x6 (void);
-EXTERN(int) jsimd_can_idct_12x12 (void);
+EXTERN(int) jsimd_can_idct_2x2(void);
+EXTERN(int) jsimd_can_idct_4x4(void);
+EXTERN(int) jsimd_can_idct_6x6(void);
+EXTERN(int) jsimd_can_idct_12x12(void);
 
-EXTERN(void) jsimd_idct_2x2 (j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr,
-                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                             JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4 (j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr,
-                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                             JDIMENSION output_col);
-EXTERN(void) jsimd_idct_6x6 (j_decompress_ptr cinfo,
-                             jpeg_component_info *compptr,
-                             JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                             JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12 (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
+EXTERN(void) jsimd_idct_2x2(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_6x6(j_decompress_ptr cinfo,
+                            jpeg_component_info *compptr, JCOEFPTR coef_block,
+                            JSAMPARRAY output_buf, JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
 
-EXTERN(int) jsimd_can_idct_islow (void);
-EXTERN(int) jsimd_can_idct_ifast (void);
-EXTERN(int) jsimd_can_idct_float (void);
+EXTERN(int) jsimd_can_idct_islow(void);
+EXTERN(int) jsimd_can_idct_ifast(void);
+EXTERN(int) jsimd_can_idct_float(void);
 
-EXTERN(void) jsimd_idct_islow (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
-EXTERN(void) jsimd_idct_ifast (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
-EXTERN(void) jsimd_idct_float (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                               JDIMENSION output_col);
+EXTERN(void) jsimd_idct_islow(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) jsimd_idct_ifast(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
+EXTERN(void) jsimd_idct_float(j_decompress_ptr cinfo,
+                              jpeg_component_info *compptr,
+                              JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                              JDIMENSION output_col);
diff --git a/media/libjpeg/jstdhuff.c b/media/libjpeg/jstdhuff.c
index e202e8e7ec..345b513d4d 100644
--- a/media/libjpeg/jstdhuff.c
+++ b/media/libjpeg/jstdhuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2013, D. R. Commander.
+ * Copyright (C) 2013, 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -17,8 +17,8 @@
  */
 
 LOCAL(void)
-add_huff_table (j_common_ptr cinfo,
-                JHUFF_TBL **htblptr, const UINT8 *bits, const UINT8 *val)
+add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
+               const UINT8 *val)
 /* Define a Huffman table */
 {
   int nsymbols, len;
@@ -29,7 +29,7 @@ add_huff_table (j_common_ptr cinfo,
     return;
 
   /* Copy the number-of-symbols-of-each-code-length counts */
-  MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
+  memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
 
   /* Validate the counts.  We do this here mainly so we can copy the right
    * number of symbols from the val[] array, without risking marching off
@@ -41,8 +41,9 @@ add_huff_table (j_common_ptr cinfo,
   if (nsymbols < 1 || nsymbols > 256)
     ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
 
-  MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
-  MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8));
+  memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
+  memset(&((*htblptr)->huffval[nsymbols]), 0,
+         (256 - nsymbols) * sizeof(UINT8));
 
   /* Initialize sent_table FALSE so table will be written to JPEG file. */
   (*htblptr)->sent_table = FALSE;
@@ -50,71 +51,79 @@ add_huff_table (j_common_ptr cinfo,
 
 
 LOCAL(void)
-std_huff_tables (j_common_ptr cinfo)
+std_huff_tables(j_common_ptr cinfo)
 /* Set up the standard Huffman tables (cf. JPEG standard section K.3) */
 /* IMPORTANT: these are only valid for 8-bit data precision! */
 {
   JHUFF_TBL **dc_huff_tbl_ptrs, **ac_huff_tbl_ptrs;
 
-  static const UINT8 bits_dc_luminance[17] =
-    { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_luminance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+  static const UINT8 bits_dc_luminance[17] = {
+    /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0
+  };
+  static const UINT8 val_dc_luminance[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+  };
 
-  static const UINT8 bits_dc_chrominance[17] =
-    { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
-  static const UINT8 val_dc_chrominance[] =
-    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
+  static const UINT8 bits_dc_chrominance[17] = {
+    /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
+  };
+  static const UINT8 val_dc_chrominance[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+  };
 
-  static const UINT8 bits_ac_luminance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
-  static const UINT8 val_ac_luminance[] =
-    { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
-      0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
-      0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
-      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
-      0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
-      0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
-      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
-      0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
-      0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
-      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
-      0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
-      0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
-      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
-      0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-      0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
-      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
-      0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
-      0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
-      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
-      0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
+  static const UINT8 bits_ac_luminance[17] = {
+    /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d
+  };
+  static const UINT8 val_ac_luminance[] = {
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
+    0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
+    0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+    0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
+    0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
+    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+    0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+    0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+    0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+    0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+    0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+    0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
+    0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+    0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
+    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
+    0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+    0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
+    0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+  };
 
-  static const UINT8 bits_ac_chrominance[17] =
-    { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
-  static const UINT8 val_ac_chrominance[] =
-    { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
-      0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
-      0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
-      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
-      0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
-      0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
-      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
-      0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
-      0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
-      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
-      0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-      0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
-      0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
-      0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
-      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
-      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
-      0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
-      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
-      0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
-      0xf9, 0xfa };
+  static const UINT8 bits_ac_chrominance[17] = {
+    /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77
+  };
+  static const UINT8 val_ac_chrominance[] = {
+    0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
+    0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
+    0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+    0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
+    0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
+    0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+    0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
+    0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
+    0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+    0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+    0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
+    0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+    0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
+    0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+    0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
+    0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
+    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+    0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
+    0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
+    0xf9, 0xfa
+  };
 
   if (cinfo->is_decompressor) {
     dc_huff_tbl_ptrs = ((j_decompress_ptr)cinfo)->dc_huff_tbl_ptrs;
diff --git a/media/libjpeg/jutils.c b/media/libjpeg/jutils.c
index f9d35023e5..d86271624a 100644
--- a/media/libjpeg/jutils.c
+++ b/media/libjpeg/jutils.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code
- * relevant to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -53,7 +53,7 @@ const int jpeg_zigzag_order[DCTSIZE2] = {
  * fake entries.
  */
 
-const int jpeg_natural_order[DCTSIZE2+16] = {
+const int jpeg_natural_order[DCTSIZE2 + 16] = {
   0,  1,  8, 16,  9,  2,  3, 10,
  17, 24, 32, 25, 18, 11,  4,  5,
  12, 19, 26, 33, 40, 48, 41, 34,
@@ -72,7 +72,7 @@ const int jpeg_natural_order[DCTSIZE2+16] = {
  */
 
 GLOBAL(long)
-jdiv_round_up (long a, long b)
+jdiv_round_up(long a, long b)
 /* Compute a/b rounded up to next integer, ie, ceil(a/b) */
 /* Assumes a >= 0, b > 0 */
 {
@@ -81,7 +81,7 @@ jdiv_round_up (long a, long b)
 
 
 GLOBAL(long)
-jround_up (long a, long b)
+jround_up(long a, long b)
 /* Compute a rounded up to next multiple of b, ie, ceil(a/b)*b */
 /* Assumes a >= 0, b > 0 */
 {
@@ -91,9 +91,9 @@ jround_up (long a, long b)
 
 
 GLOBAL(void)
-jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
-                   JSAMPARRAY output_array, int dest_row,
-                   int num_rows, JDIMENSION num_cols)
+jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
+                  JSAMPARRAY output_array, int dest_row, int num_rows,
+                  JDIMENSION num_cols)
 /* Copy some rows of samples from one place to another.
  * num_rows rows are copied from input_array[source_row++]
  * to output_array[dest_row++]; these areas may overlap for duplication.
@@ -101,7 +101,7 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
  */
 {
   register JSAMPROW inptr, outptr;
-  register size_t count = (size_t) (num_cols * sizeof(JSAMPLE));
+  register size_t count = (size_t)(num_cols * sizeof(JSAMPLE));
   register int row;
 
   input_array += source_row;
@@ -110,24 +110,24 @@ jcopy_sample_rows (JSAMPARRAY input_array, int source_row,
   for (row = num_rows; row > 0; row--) {
     inptr = *input_array++;
     outptr = *output_array++;
-    MEMCOPY(outptr, inptr, count);
+    memcpy(outptr, inptr, count);
   }
 }
 
 
 GLOBAL(void)
-jcopy_block_row (JBLOCKROW input_row, JBLOCKROW output_row,
-                 JDIMENSION num_blocks)
+jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
+                JDIMENSION num_blocks)
 /* Copy a row of coefficient blocks from one place to another. */
 {
-  MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
+  memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
 }
 
 
 GLOBAL(void)
-jzero_far (void *target, size_t bytestozero)
+jzero_far(void *target, size_t bytestozero)
 /* Zero out a chunk of memory. */
 /* This might be sample-array data, block-array data, or alloc_large data. */
 {
-  MEMZERO(target, bytestozero);
+  memset(target, 0, bytestozero);
 }
diff --git a/media/libjpeg/jversion.h b/media/libjpeg/jversion.h
index 6ce663d82a..63db95b99b 100644
--- a/media/libjpeg/jversion.h
+++ b/media/libjpeg/jversion.h
@@ -2,9 +2,9 @@
  * jversion.h
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2016, D. R. Commander.
+ * Copyright (C) 2010, 2012-2022, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -30,20 +30,25 @@
  * NOTE: It is our convention to place the authors in the following order:
  * - libjpeg-turbo authors (2009-) in descending order of the date of their
  *   most recent contribution to the project, then in ascending order of the
- *   date of their first contribution to the project
+ *   date of their first contribution to the project, then in alphabetical
+ *   order
  * - Upstream authors in descending order of the date of the first inclusion of
  *   their code
  */
 
-#define JCOPYRIGHT      "Copyright (C) 2009-2016 D. R. Commander\n" \
-                        "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
-                        "Copyright (C) 2015-2016 Matthieu Darbois\n" \
-                        "Copyright (C) 2015 Google, Inc.\n" \
-                        "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
-                        "Copyright (C) 2013 Linaro Limited\n" \
-                        "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
-                        "Copyright (C) 2009 Pierre Ossman for Cendio AB\n" \
-                        "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-                        "Copyright (C) 1991-2016 Thomas G. Lane, Guido Vollbeding" \
+#define JCOPYRIGHT \
+  "Copyright (C) 2009-2022 D. R. Commander\n" \
+  "Copyright (C) 2015, 2020 Google, Inc.\n" \
+  "Copyright (C) 2019-2020 Arm Limited\n" \
+  "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
+  "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
+  "Copyright (C) 2015 Intel Corporation\n" \
+  "Copyright (C) 2013-2014 Linaro Limited\n" \
+  "Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
+  "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
+  "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
+  "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
+  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
 
-#define JCOPYRIGHT_SHORT "Copyright (C) 1991-2016 The libjpeg-turbo Project and many others"
+#define JCOPYRIGHT_SHORT \
+  "Copyright (C) 1991-2022 The libjpeg-turbo Project and many others"
diff --git a/media/libjpeg/simd/arm/aarch32/jccolext-neon.c b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 0000000000..362102d2b2
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 8) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+#ifdef HAVE_VLD1_U16_X2
+  const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+  const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+  const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+  const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 8) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+      uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_low = scaled_128_5;
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+      cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+      uint32x4_t cb_high = scaled_128_5;
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+      cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_low = scaled_128_5;
+      cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+      uint32x4_t cr_high = scaled_128_5;
+      cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+                                      vrshrn_n_u32(y_high, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+                                       vshrn_n_u32(cb_high, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+                                       vshrn_n_u32(cr_high, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+      /* Increment pointers. */
+      inptr += (8 * RGB_PIXELSIZE);
+      outptr0 += 8;
+      outptr1 += 8;
+      outptr2 += 8;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jchuff-neon.c b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 0000000000..19d94f720d
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint8_t block_nbits[DCTSIZE2];
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load rows of coefficients from DCT block in zig-zag order. */
+
+  /* Compute DC coefficient difference value. (F.1.1.5.1) */
+  int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+  row0 = vld1q_lane_s16(block +  1, row0, 1);
+  row0 = vld1q_lane_s16(block +  8, row0, 2);
+  row0 = vld1q_lane_s16(block + 16, row0, 3);
+  row0 = vld1q_lane_s16(block +  9, row0, 4);
+  row0 = vld1q_lane_s16(block +  2, row0, 5);
+  row0 = vld1q_lane_s16(block +  3, row0, 6);
+  row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+  int16x8_t row1 = vld1q_dup_s16(block + 17);
+  row1 = vld1q_lane_s16(block + 24, row1, 1);
+  row1 = vld1q_lane_s16(block + 32, row1, 2);
+  row1 = vld1q_lane_s16(block + 25, row1, 3);
+  row1 = vld1q_lane_s16(block + 18, row1, 4);
+  row1 = vld1q_lane_s16(block + 11, row1, 5);
+  row1 = vld1q_lane_s16(block +  4, row1, 6);
+  row1 = vld1q_lane_s16(block +  5, row1, 7);
+
+  int16x8_t row2 = vld1q_dup_s16(block + 12);
+  row2 = vld1q_lane_s16(block + 19, row2, 1);
+  row2 = vld1q_lane_s16(block + 26, row2, 2);
+  row2 = vld1q_lane_s16(block + 33, row2, 3);
+  row2 = vld1q_lane_s16(block + 40, row2, 4);
+  row2 = vld1q_lane_s16(block + 48, row2, 5);
+  row2 = vld1q_lane_s16(block + 41, row2, 6);
+  row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+  int16x8_t row3 = vld1q_dup_s16(block + 27);
+  row3 = vld1q_lane_s16(block + 20, row3, 1);
+  row3 = vld1q_lane_s16(block + 13, row3, 2);
+  row3 = vld1q_lane_s16(block +  6, row3, 3);
+  row3 = vld1q_lane_s16(block +  7, row3, 4);
+  row3 = vld1q_lane_s16(block + 14, row3, 5);
+  row3 = vld1q_lane_s16(block + 21, row3, 6);
+  row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  int16x8_t row1_lz = vclzq_s16(abs_row1);
+  int16x8_t row2_lz = vclzq_s16(abs_row2);
+  int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+  uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+  uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+  uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+  vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+  vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+  vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+  vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+  uint16x8_t row0_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+              vnegq_s16(row0_lz));
+  uint16x8_t row1_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+              vnegq_s16(row1_lz));
+  uint16x8_t row2_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+              vnegq_s16(row2_lz));
+  uint16x8_t row3_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+              vnegq_s16(row3_lz));
+
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+  uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+  uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+  /* Store diff values for rows 0, 1, 2, and 3. */
+  vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+  vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+  vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+  vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+  /* Load last four rows of coefficients from DCT block in zig-zag order. */
+  int16x8_t row4 = vld1q_dup_s16(block + 35);
+  row4 = vld1q_lane_s16(block + 42, row4, 1);
+  row4 = vld1q_lane_s16(block + 49, row4, 2);
+  row4 = vld1q_lane_s16(block + 56, row4, 3);
+  row4 = vld1q_lane_s16(block + 57, row4, 4);
+  row4 = vld1q_lane_s16(block + 50, row4, 5);
+  row4 = vld1q_lane_s16(block + 43, row4, 6);
+  row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+  int16x8_t row5 = vld1q_dup_s16(block + 29);
+  row5 = vld1q_lane_s16(block + 22, row5, 1);
+  row5 = vld1q_lane_s16(block + 15, row5, 2);
+  row5 = vld1q_lane_s16(block + 23, row5, 3);
+  row5 = vld1q_lane_s16(block + 30, row5, 4);
+  row5 = vld1q_lane_s16(block + 37, row5, 5);
+  row5 = vld1q_lane_s16(block + 44, row5, 6);
+  row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+  int16x8_t row6 = vld1q_dup_s16(block + 58);
+  row6 = vld1q_lane_s16(block + 59, row6, 1);
+  row6 = vld1q_lane_s16(block + 52, row6, 2);
+  row6 = vld1q_lane_s16(block + 45, row6, 3);
+  row6 = vld1q_lane_s16(block + 38, row6, 4);
+  row6 = vld1q_lane_s16(block + 31, row6, 5);
+  row6 = vld1q_lane_s16(block + 39, row6, 6);
+  row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+  int16x8_t row7 = vld1q_dup_s16(block + 53);
+  row7 = vld1q_lane_s16(block + 60, row7, 1);
+  row7 = vld1q_lane_s16(block + 61, row7, 2);
+  row7 = vld1q_lane_s16(block + 54, row7, 3);
+  row7 = vld1q_lane_s16(block + 47, row7, 4);
+  row7 = vld1q_lane_s16(block + 55, row7, 5);
+  row7 = vld1q_lane_s16(block + 62, row7, 6);
+  row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  int16x8_t row4_lz = vclzq_s16(abs_row4);
+  int16x8_t row5_lz = vclzq_s16(abs_row5);
+  int16x8_t row6_lz = vclzq_s16(abs_row6);
+  int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+  /* Compute number of bits required to represent each coefficient. */
+  uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+  uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+  uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+  uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+  vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+  vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+  vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+  vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+  uint16x8_t row4_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+              vnegq_s16(row4_lz));
+  uint16x8_t row5_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+              vnegq_s16(row5_lz));
+  uint16x8_t row6_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+              vnegq_s16(row6_lz));
+  uint16x8_t row7_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+              vnegq_s16(row7_lz));
+
+  uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+  uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+  uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+  uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+  /* Store diff values for rows 4, 5, 6, and 7. */
+  vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+  vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+  vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+  vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+  uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+  uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+  uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+  uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+  uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+  uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+  uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+  row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+  row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+  row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+  row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+  row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+  row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+  row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+  row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+  /* Move bitmap to 32-bit scalar registers. */
+  uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+  uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  unsigned int nbits = block_nbits[0];
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = block_diff[0];
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  while (bitmap_1_32 != 0) {
+    r = BUILTIN_CLZ(bitmap_1_32);
+    i += r;
+    bitmap_1_32 <<= r;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    i++;
+    bitmap_1_32 <<= 1;
+  }
+
+  r = 33 - i;
+  i = 33;
+
+  while (bitmap_33_63 != 0) {
+    unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+    r += leading_zeros;
+    i += leading_zeros;
+    bitmap_33_63 <<= leading_zeros;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    r = 0;
+    i++;
+    bitmap_33_63 <<= 1;
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd.c b/media/libjpeg/simd/arm/aarch32/jsimd.c
new file mode 100644
index 0000000000..e3adf23d50
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd.c
@@ -0,0 +1,980 @@
+/*
+ * jsimd_arm.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "Features", 8) != 0)
+    return 0;
+  buffer += 8;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "neon"))
+        simd_support |= JSIMD_NEON;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if !defined(__ARM_NEON__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ARM_NEON__)
+  simd_support |= JSIMD_NEON;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  /* We still have a chance to use Neon regardless of globally used
+   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux/android */
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_ycc_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_ycc_extbgr_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_ycc_extrgb_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/arm/aarch32/jsimd_neon.S b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 0000000000..7e1e2b1451
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.arm
+.syntax unified
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336  (2446)
+#define FIX_0_390180644  (3196)
+#define FIX_0_541196100  (4433)
+#define FIX_0_765366865  (6270)
+#define FIX_0_899976223  (7373)
+#define FIX_1_175875602  (9633)
+#define FIX_1_501321110  (12299)
+#define FIX_1_847759065  (15137)
+#define FIX_1_961570560  (16069)
+#define FIX_2_053119869  (16819)
+#define FIX_2_562915447  (20995)
+#define FIX_3_072711026  (25172)
+
+#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+  JLONG   q1, q2, q3, q4, q5, q6, q7; \
+  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+  \
+  /* 1-D iDCT input data */ \
+  row0 = xrow0; \
+  row1 = xrow1; \
+  row2 = xrow2; \
+  row3 = xrow3; \
+  row4 = xrow4; \
+  row5 = xrow5; \
+  row6 = xrow6; \
+  row7 = xrow7; \
+  \
+  q5 = row7 + row3; \
+  q4 = row5 + row1; \
+  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+       MULTIPLY(q4, FIX_1_175875602); \
+  q7 = MULTIPLY(q5, FIX_1_175875602) + \
+       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+  q2 = MULTIPLY(row2, FIX_0_541196100) + \
+       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+  q4 = q6; \
+  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+  /* now we can use q1 (reloadable constants have been used up) */ \
+  q1 = q3 + q2; \
+  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+        MULTIPLY(row1, -FIX_0_899976223); \
+  q5 = q7; \
+  q1 = q1 + q6; \
+  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+  \
+  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+  tmp11_plus_tmp2 = q1; \
+  row1 = 0; \
+  \
+  q1 = q1 - q6; \
+  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+        MULTIPLY(row3, -FIX_2_562915447); \
+  q1 = q1 - q6; \
+  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+       MULTIPLY(row6, FIX_0_541196100); \
+  q3 = q3 - q2; \
+  \
+  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+  tmp11_minus_tmp2 = q1; \
+  \
+  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+  q2 = q1 + q6; \
+  q1 = q1 - q6; \
+  \
+  /* pick up the results */ \
+  tmp0  = q4; \
+  tmp1  = q5; \
+  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+  tmp3  = q7; \
+  tmp10 = q2; \
+  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+  tmp12 = q3; \
+  tmp13 = q1; \
+}
+
+#define XFIX_0_899976223                    d0[0]
+#define XFIX_0_541196100                    d0[1]
+#define XFIX_2_562915447                    d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
+#define XFIX_1_175875602                    d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+
+.balign 16
+jsimd_idct_islow_neon_consts:
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    ROW0L           .req d16
+    ROW0R           .req d17
+    ROW1L           .req d18
+    ROW1R           .req d19
+    ROW2L           .req d20
+    ROW2R           .req d21
+    ROW3L           .req d22
+    ROW3R           .req d23
+    ROW4L           .req d24
+    ROW4R           .req d25
+    ROW5L           .req d26
+    ROW5R           .req d27
+    ROW6L           .req d28
+    ROW6R           .req d29
+    ROW7L           .req d30
+    ROW7R           .req d31
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_islow_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
+    add             ip, ip, #16
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d15}                    /* save Neon registers */
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    vadd.s16        d4, ROW7L, ROW3L
+    vadd.s16        d5, ROW5L, ROW1L
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
+      /* Check for the zero coefficients in the right 4x8 half */
+      push            {r4, r5}
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5
+    vsub.s32        q3, q1, q4
+      pop             {r4, r5}
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
+
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
+      /* Transpose left 4x8 half */
+      vtrn.16         ROW6L, ROW7L
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
+      vtrn.16         ROW2L, ROW3L
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
+      vtrn.16         ROW0L, ROW1L
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+      vtrn.16         ROW4L, ROW5L
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+      vtrn.32         ROW1L, ROW3L
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
+      vtrn.32         ROW4L, ROW6L
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      vtrn.32         ROW0L, ROW2L
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
+      vtrn.32         ROW5L, ROW7L
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16         ROW6R, ROW7R
+    vtrn.16         ROW2R, ROW3R
+    vtrn.16         ROW0R, ROW1R
+    vtrn.16         ROW4R, ROW5R
+    vtrn.32         ROW1R, ROW3R
+    vtrn.32         ROW4R, ROW6R
+    vtrn.32         ROW0R, ROW2R
+    vtrn.32         ROW5R, ROW7R
+
+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+
+2:  /* Descale to 8-bit and range limit */
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8 - d15}                    /* restore Neon registers */
+    vqrshrn.s16     d20, q12, #2
+      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
+        /* Store results to the output buffer */
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d16}, [TMP1]
+      vtrn.8          d20, d21
+        vst1.8          {d17}, [TMP2]
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d18}, [TMP1]
+      vadd.u8         q10, q10, q0
+        vst1.8          {d19}, [TMP2]
+        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        add             TMP3, TMP3, OUTPUT_COL
+        add             TMP4, TMP4, OUTPUT_COL
+      vtrn.8          d22, d23
+        vst1.8          {d20}, [TMP1]
+      vadd.u8         q11, q11, q0
+        vst1.8          {d21}, [TMP2]
+        vst1.8          {d22}, [TMP3]
+        vst1.8          {d23}, [TMP4]
+    bx              lr
+
+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    vtrn.16         ROW6L, ROW7L
+    vtrn.16         ROW2L, ROW3L
+    vtrn.16         ROW0L, ROW1L
+    vtrn.16         ROW4L, ROW5L
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
+    vtrn.32         ROW1L, ROW3L
+    vtrn.32         ROW4L, ROW6L
+    vtrn.32         ROW0L, ROW2L
+    vtrn.32         ROW5L, ROW7L
+
+    cmp             r0, #0
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half  */
+    vdup.s16        ROW1R, ROW0R[1]
+    vdup.s16        ROW2R, ROW0R[2]
+    vdup.s16        ROW3R, ROW0R[3]
+    vdup.s16        ROW4R, ROW0R[0]
+    vdup.s16        ROW5R, ROW0R[1]
+    vdup.s16        ROW6R, ROW0R[2]
+    vdup.s16        ROW7R, ROW0R[3]
+    vdup.s16        ROW0R, ROW0R[0]
+    b               1b  /* Go to 'normal' second pass */
+
+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+    .unreq          ROW0L
+    .unreq          ROW0R
+    .unreq          ROW1L
+    .unreq          ROW1R
+    .unreq          ROW2L
+    .unreq          ROW2R
+    .unreq          ROW3L
+    .unreq          ROW3R
+    .unreq          ROW4L
+    .unreq          ROW4R
+    .unreq          ROW5L
+    .unreq          ROW5R
+    .unreq          ROW6L
+    .unreq          ROW6R
+    .unreq          ROW7L
+    .unreq          ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in Arm Neon case some extra additions are required because VQDMULH
+ * instruction can't handle the constants larger than 1. So the expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
+
+#define XFIX_1_082392200  d0[0]
+#define XFIX_1_414213562  d0[1]
+#define XFIX_1_847759065  d0[2]
+#define XFIX_2_613125930  d0[3]
+
+.balign 16
+jsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
+
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_ifast_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0}, [ip, :64]  /* load constants */
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d13}       /* save Neon registers */
+    /* 1-D IDCT, pass 1 */
+    vsub.s16        q2, q10, q14
+    vadd.s16        q14, q10, q14
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vadd.s16        q10, q10, q2
+      /* Transpose */
+      vtrn.16         q8, q9
+    vsub.s16        q11, q12, q1
+      vtrn.16         q14, q15
+    vadd.s16        q12, q12, q1
+      vtrn.16         q10, q11
+      vtrn.16         q12, q13
+      vtrn.32         q9, q11
+      vtrn.32         q12, q14
+      vtrn.32         q8, q10
+      vtrn.32         q13, q15
+      vswp            d28, d21
+      vswp            d26, d19
+    /* 1-D IDCT, pass 2 */
+    vsub.s16        q2, q10, q14
+      vswp            d30, d23
+    vadd.s16        q14, q10, q14
+      vswp            d24, d17
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vpop            {d8 - d13}    /* restore Neon registers */
+    vadd.s16        q10, q10, q2
+    vsub.s16        q11, q12, q1
+    vadd.s16        q12, q12, q1
+    /* Descale to 8-bit and range limit */
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
+    vqshrn.s16      d18, q10, #5
+    vqshrn.s16      d19, q11, #5
+    vqshrn.s16      d20, q12, #5
+    vqshrn.s16      d21, q13, #5
+    vqshrn.s16      d22, q14, #5
+    vqshrn.s16      d23, q15, #5
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
+    vadd.u8         q10, q10, q0
+    vadd.u8         q11, q11, q0
+    /* Transpose the final 8-bit samples */
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+      /* Store results to the output buffer */
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d16}, [TMP1]
+      vst1.8          {d17}, [TMP2]
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d18}, [TMP1]
+    vtrn.8          d20, d21
+      vst1.8          {d19}, [TMP2]
+      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      add             TMP3, TMP3, OUTPUT_COL
+      add             TMP4, TMP4, OUTPUT_COL
+      vst1.8          {d20}, [TMP1]
+    vtrn.8          d22, d23
+      vst1.8          {d21}, [TMP2]
+      vst1.8          {d22}, [TMP3]
+      vst1.8          {d23}, [TMP4]
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro do_store size
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+.macro do_load bpp, size
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+.macro do_rgb_to_yuv_stage1
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.macro do_rgb_to_yuv_stage2
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
+.endm
+
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
+
+asm_function jsimd_\colorid\()_ycc_convert_neon
+    OUTPUT_WIDTH    .req r0
+    INPUT_BUF       .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_ROW      .req r3
+    NUM_ROWS        .req r4
+
+    OUTPUT_BUF0     .req r5
+    OUTPUT_BUF1     .req r6
+    OUTPUT_BUF2     .req OUTPUT_BUF
+
+    RGB             .req r7
+    Y               .req r8
+    U               .req r9
+    V               .req r10
+    N               .req ip
+
+    /* Load constants to d0, d1, d2, d3 */
+    adr             ip, jsimd_\colorid\()_ycc_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]
+
+    /* Save Arm registers and handle input arguments */
+    push            {r4, r5, r6, r7, r8, r9, r10, lr}
+    ldr             NUM_ROWS, [sp, #(4 * 8)]
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers */
+    vpush           {d8 - d15}
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    blt             9f
+0:
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #4
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    blt             3f
+    do_load         \bpp, 8
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    blt             2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1
+    subs            N, N, #8
+    bge             1b
+2:
+    do_rgb_to_yuv_stage2
+    do_store        8
+    tst             N, #7
+    beq             8f
+3:
+    tst             N, #4
+    beq             3f
+    do_load         \bpp, 4
+3:
+    tst             N, #2
+    beq             4f
+    do_load         \bpp, 2
+4:
+    tst             N, #1
+    beq             5f
+    do_load         \bpp, 1
+5:
+    do_rgb_to_yuv
+    tst             N, #4
+    beq             6f
+    do_store        4
+6:
+    tst             N, #2
+    beq             7f
+    do_store        2
+7:
+    tst             N, #1
+    beq             8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    bgt             0b
+9:
+    /* Restore all registers and return */
+    vpop            {d8 - d15}
+    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*--------------------------------- id ----- bpp R  G  B */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+
+.purgem do_load
+.purgem do_store
diff --git a/media/libjpeg/simd/arm/aarch64/jccolext-neon.c b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 0000000000..37130c225e
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants. */
+  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr0 += 16;
+      outptr1 += 16;
+      outptr2 += 16;
+    }
+
+    if (cols_remaining > 8) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+    } else if (cols_remaining > 0) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_l = scaled_128_5;
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+      uint32x4_t cb_h = scaled_128_5;
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_l = scaled_128_5;
+      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+      uint32x4_t cr_h = scaled_128_5;
+      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+                                      vrshrn_n_u32(y_h, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+                                       vshrn_n_u32(cb_h, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+                                       vshrn_n_u32(cr_h, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jchuff-neon.c b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 0000000000..607a116070
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,411 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, 2022, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+    0,   1,   2,   3,  16,  17,  32,  33,
+   18,  19,   4,   5,   6,   7,  20,  21,
+   34,  35,  48,  49, 255, 255,  50,  51,
+   36,  37,  22,  23,   8,   9,  10,  11,
+  255, 255,   6,   7,  20,  21,  34,  35,
+   48,  49, 255, 255,  50,  51,  36,  37,
+   54,  55,  40,  41,  26,  27,  12,  13,
+   14,  15,  28,  29,  42,  43,  56,  57,
+    6,   7,  20,  21,  34,  35,  48,  49,
+   50,  51,  36,  37,  22,  23,   8,   9,
+   26,  27,  12,  13, 255, 255,  14,  15,
+   28,  29,  42,  43,  56,  57, 255, 255,
+   52,  53,  54,  55,  40,  41,  26,  27,
+   12,  13, 255, 255,  14,  15,  28,  29,
+   26,  27,  40,  41,  42,  43,  28,  29,
+   14,  15,  30,  31,  44,  45,  46,  47
+};
+
+/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
+ * address warning because the macro sometimes writes a 64-bit value to a
+ * non-64-bit-aligned address.  That behavior is technically undefined per
+ * the C specification, but it is supported by the AArch64 architecture and
+ * compilers.
+ */
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("alignment")))
+#endif
+#endif
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const uint8x16x4_t idx_rows_0123 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+  const uint8x16x4_t idx_rows_4567 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+  /* GCC does not currently support intrinsics vl1dq_<type>_x4(). */
+  const uint8x16x4_t idx_rows_0123 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+  } };
+  const uint8x16x4_t idx_rows_4567 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+  } };
+#endif
+
+  /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const int8x16x4_t tbl_rows_0123 =
+    vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+  const int8x16x4_t tbl_rows_4567 =
+    vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+  const int8x16x4_t tbl_rows_0123 = { {
+    vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+  } };
+  const int8x16x4_t tbl_rows_4567 = { {
+    vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+  } };
+#endif
+
+  /* Initialise extra lookup tables. */
+  const int8x16x4_t tbl_rows_2345 = { {
+    tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+    tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+  } };
+  const int8x16x3_t tbl_rows_567 =
+    { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+  /* Shuffle coefficients into zig-zag order. */
+  int16x8_t row0 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+  int16x8_t row1 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+  int16x8_t row2 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+  int16x8_t row3 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+  int16x8_t row4 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+  int16x8_t row5 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+  int16x8_t row6 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+  int16x8_t row7 =
+    vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+  /* Compute DC coefficient difference value (F.1.1.5.1). */
+  row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+  /* Initialize AC coefficient lanes not reachable by lookup tables. */
+  row1 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+                                  0), row1, 2);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  4), row2, 0);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  0), row2, 5);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  7), row5, 2);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  3), row5, 7);
+  row6 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+                                  7), row6, 5);
+
+  /* DCT block is now in zig-zag order; start Huffman encoding process. */
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
+  uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
+  uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
+  uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
+  uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
+  uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
+  uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
+  uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
+
+  uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
+                                    vreinterpretq_u8_u16(row0_ne_0));
+  uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
+                                    vreinterpretq_u8_u16(row2_ne_0));
+  uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
+                                    vreinterpretq_u8_u16(row4_ne_0));
+  uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
+                                    vreinterpretq_u8_u16(row6_ne_0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x16_t bitmap_mask =
+    vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
+
+  uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
+  uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
+
+  uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
+                                              bitmap_rows_3210);
+  uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
+                                  vget_high_u8(bitmap_rows_76543210));
+
+  /* Shift left to remove DC bit. */
+  bitmap_all =
+    vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+  /* Count bits set (number of non-zero coefficients) in bitmap. */
+  unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+  /* Move bitmap to 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  /* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  /* Find nbits required to specify sign and amplitude of coefficient. */
+  unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
+  unsigned int nbits = 16 - lz;
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = vgetq_lane_u16(row0_diff, 0);
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  /* The most efficient method of computing nbits and diff depends on the
+   * number of non-zero coefficients.  If the bitmap is not too sparse (> 8
+   * non-zero AC coefficients), it is beneficial to do all of the work using
+   * Neon; else we do some of the work using Neon and the rest on demand using
+   * scalar code.
+   */
+  if (non_zero_coefficients > 8) {
+    uint8_t block_nbits[DCTSIZE2];
+
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    int16x8_t row1_lz = vclzq_s16(abs_row1);
+    int16x8_t row2_lz = vclzq_s16(abs_row2);
+    int16x8_t row3_lz = vclzq_s16(abs_row3);
+    int16x8_t row4_lz = vclzq_s16(abs_row4);
+    int16x8_t row5_lz = vclzq_s16(abs_row5);
+    int16x8_t row6_lz = vclzq_s16(abs_row6);
+    int16x8_t row7_lz = vclzq_s16(abs_row7);
+    /* Narrow leading zero count to 8 bits. */
+    uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
+                                    vreinterpretq_u8_s16(row1_lz));
+    uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
+                                    vreinterpretq_u8_s16(row3_lz));
+    uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
+                                    vreinterpretq_u8_s16(row5_lz));
+    uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
+                                    vreinterpretq_u8_s16(row7_lz));
+    /* Compute nbits needed to specify magnitude of each coefficient. */
+    uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
+    uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
+    uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
+    uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
+    /* Store nbits. */
+    vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
+    vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
+    vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
+    vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
+    /* Mask bits not required to specify sign and amplitude of diff. */
+    uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
+    uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
+    uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
+    uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
+    uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
+    uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
+    uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
+    /* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     row1_mask);
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     row2_mask);
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     row3_mask);
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     row4_mask);
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     row5_mask);
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     row6_mask);
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     row7_mask);
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      nbits = block_nbits[i];
+      diff = block_diff[i];
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  } else if (bitmap != 0) {
+    uint16_t block_abs[DCTSIZE2];
+    /* Compute and store absolute value of coefficients. */
+    int16x8_t abs_row1 = vabsq_s16(row1);
+    int16x8_t abs_row2 = vabsq_s16(row2);
+    int16x8_t abs_row3 = vabsq_s16(row3);
+    int16x8_t abs_row4 = vabsq_s16(row4);
+    int16x8_t abs_row5 = vabsq_s16(row5);
+    int16x8_t abs_row6 = vabsq_s16(row6);
+    int16x8_t abs_row7 = vabsq_s16(row7);
+    vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+    vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+    vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+    vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+    vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+    vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+    vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+    vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+    /* Compute diff bits (without nbits mask) and store. */
+    uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
+                                     vcltzq_s16(row1));
+    uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
+                                     vcltzq_s16(row2));
+    uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
+                                     vcltzq_s16(row3));
+    uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
+                                     vcltzq_s16(row4));
+    uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
+                                     vcltzq_s16(row5));
+    uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
+                                     vcltzq_s16(row6));
+    uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
+                                     vcltzq_s16(row7));
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    /* Same as above but must mask diff bits and compute nbits on demand. */
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      lz = BUILTIN_CLZ(block_abs[i]);
+      nbits = 32 - lz;
+      diff = ((unsigned int)block_diff[i] << lz) >> lz;
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/media/libjpeg/simd/arm/aarch64/jsimd.c b/media/libjpeg/simd/arm/aarch64/jsimd.c
new file mode 100644
index 0000000000..604d5472f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/aarch64/jsimd.c
@@ -0,0 +1,1058 @@
+/*
+ * jsimd_arm64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit Arm architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "jconfigint.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#define JSIMD_FASTLD3  1
+#define JSIMD_FASTST3  2
+#define JSIMD_FASTTBL  4
+
+static unsigned int simd_support = ~0;
+static unsigned int simd_huffman = 1;
+static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
+                                    JSIMD_FASTTBL;
+
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_cpuinfo(char *buffer, const char *field, char *value)
+{
+  char *p;
+
+  if (*value == 0)
+    return 0;
+  if (strncmp(buffer, field, strlen(field)) != 0)
+    return 0;
+  buffer += strlen(field);
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'value' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, value))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(value);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
+          check_cpuinfo(buffer, "CPU part", "0xd07"))
+        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
+           percent speedup by disabling the use of that instruction.  The
+           speedup on Cortex-A57 is more subtle but still measurable. */
+        simd_features &= ~JSIMD_FASTTBL;
+      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
+        /* The SIMD version of Huffman encoding is slower than the C version on
+           Cavium ThunderX.  Also, ld3 and st3 are abyssmally slow on that
+           CPU. */
+        simd_huffman = simd_features = 0;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+
+/*
+ * Armv8 architectures support Neon extensions by default.
+ * It is no longer optional as it was with Armv7.
+ */
+
+
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+  simd_support |= JSIMD_NEON;
+#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCENEON") && !strcmp(env, "1"))
+    simd_support = JSIMD_NEON;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "1"))
+    simd_features |= JSIMD_FASTLD3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTLD3") && !strcmp(env, "0"))
+    simd_features &= ~JSIMD_FASTLD3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "1"))
+    simd_features |= JSIMD_FASTST3;
+  if (!GETENV_S(env, 2, "JSIMD_FASTST3") && !strcmp(env, "0"))
+    simd_features &= ~JSIMD_FASTST3;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_ycc_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_ycc_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_ycc_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_ycc_convert_neon;
+    break;
+  default:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTLD3)
+#endif
+      neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_ycc_extrgbx_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_ycc_extbgrx_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_ycc_extxbgr_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_ycc_extxrgb_convert_neon;
+    break;
+  default:
+#ifndef NEON_INTRINSICS
+    if (simd_features & JSIMD_FASTST3)
+#endif
+      neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
+    else
+      neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
+    break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
+                                output_buf, num_rows);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_neon(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_neon(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_neon(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON && simd_huffman)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+#ifndef NEON_INTRINSICS
+  if (simd_features & JSIMD_FASTTBL)
+#endif
+    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
+                                            dctbl, actbl);
+#ifndef NEON_INTRINSICS
+  else
+    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
+                                                    last_dc_val, dctbl, actbl);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/jsimd_arm64_neon.S b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
similarity index 60%
rename from media/libjpeg/simd/jsimd_arm64_neon.S
rename to media/libjpeg/simd/arm/aarch64/jsimd_neon.S
index 3309858241..738a4f0658 100644
--- a/media/libjpeg/simd/jsimd_arm64_neon.S
+++ b/media/libjpeg/simd/arm/aarch64/jsimd_neon.S
@@ -1,13 +1,13 @@
 /*
- * ARMv8 NEON optimizations for libjpeg-turbo
+ * Armv8 Neon optimizations for libjpeg-turbo
  *
  * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
  * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
- * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
- * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
+ * Author:  Ragesh Radhakrishnan <ragesh.r@linaro.org>
+ * Copyright (C) 2014-2016, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
  * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
@@ -31,125 +31,28 @@
 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
 #endif
 
-.text
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
-    .globl _\fname
-_\fname:
+#if defined(__APPLE__)
+.section __DATA, __const
+#elif defined(_WIN32)
+.section .rdata
 #else
-    .global \fname
-#ifdef __ELF__
-    .hidden \fname
-    .type \fname, %function
+.section .rodata, "a", %progbits
 #endif
-\fname:
-#endif
-.endm
 
-/* Transpose elements of single 128 bit registers */
-.macro transpose_single x0, x1, xi, xilen, literal
-    ins             \xi\xilen[0], \x0\xilen[0]
-    ins             \x1\xilen[0], \x0\xilen[1]
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
+/* Constants for jsimd_idct_islow_neon() */
 
-/* Transpose elements of 2 differnet registers */
-.macro transpose x0, x1, xi, xilen, literal
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x2\x2len
-    trn2            \x2\x2len, \xi\x0len, \x2\x2len
-    mov             \xi\xilen, \x1\xilen
-    trn1            \x1\x1len, \x1\x1len, \x3\x3len
-    trn2            \x3\x3len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x1\x1len
-    trn2            \x1\x2len, \xi\x0len, \x1\x2len
-    mov             \xi\xilen, \x2\xilen
-    trn1            \x2\x2len, \x2\x2len, \x3\x3len
-    trn2            \x3\x2len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4 x0, x1, x2, x3, x5
-    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
-    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
-.endm
-
-.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
-    trn1            \t0\().8h, \l0\().8h, \l1\().8h
-    trn1            \t1\().8h, \l2\().8h, \l3\().8h
-    trn1            \t2\().8h, \l4\().8h, \l5\().8h
-    trn1            \t3\().8h, \l6\().8h, \l7\().8h
-    trn2            \l1\().8h, \l0\().8h, \l1\().8h
-    trn2            \l3\().8h, \l2\().8h, \l3\().8h
-    trn2            \l5\().8h, \l4\().8h, \l5\().8h
-    trn2            \l7\().8h, \l6\().8h, \l7\().8h
-
-    trn1            \l4\().4s, \t2\().4s, \t3\().4s
-    trn2            \t3\().4s, \t2\().4s, \t3\().4s
-    trn1            \t2\().4s, \t0\().4s, \t1\().4s
-    trn2            \l2\().4s, \t0\().4s, \t1\().4s
-    trn1            \t0\().4s, \l1\().4s, \l3\().4s
-    trn2            \l3\().4s, \l1\().4s, \l3\().4s
-    trn2            \t1\().4s, \l5\().4s, \l7\().4s
-    trn1            \l5\().4s, \l5\().4s, \l7\().4s
-
-    trn2            \l6\().2d, \l2\().2d, \t3\().2d
-    trn1            \l0\().2d, \t2\().2d, \l4\().2d
-    trn1            \l1\().2d, \t0\().2d, \l5\().2d
-    trn2            \l7\().2d, \l3\().2d, \t1\().2d
-    trn1            \l2\().2d, \l2\().2d, \t3\().2d
-    trn2            \l4\().2d, \t2\().2d, \l4\().2d
-    trn1            \l3\().2d, \l3\().2d, \t1\().2d
-    trn2            \l5\().2d, \t0\().2d, \l5\().2d
-.endm
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- *                        JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-
-#define F_0_298  2446  /* FIX(0.298631336) */
-#define F_0_390  3196  /* FIX(0.390180644) */
-#define F_0_541  4433  /* FIX(0.541196100) */
-#define F_0_765  6270  /* FIX(0.765366865) */
-#define F_0_899  7373  /* FIX(0.899976223) */
-#define F_1_175  9633  /* FIX(1.175875602) */
-#define F_1_501 12299  /* FIX(1.501321110) */
-#define F_1_847 15137  /* FIX(1.847759065) */
-#define F_1_961 16069  /* FIX(1.961570560) */
-#define F_2_053 16819  /* FIX(2.053119869) */
-#define F_2_562 20995  /* FIX(2.562915447) */
-#define F_3_072 25172  /* FIX(3.072711026) */
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
 
 .balign 16
 Ljsimd_idct_islow_neon_consts:
@@ -183,18 +86,191 @@ Ljsimd_idct_islow_neon_consts:
 #undef F_2_562
 #undef F_3_072
 
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
+/* Constants for jsimd_ycc_*_neon() */
+
+.balign 16
+Ljsimd_ycc_rgb_neon_consts:
+  .short 0,      0,     0,      0
+  .short 22971, -11277, -23401, 29033
+  .short -128,  -128,   -128,   -128
+  .short -128,  -128,   -128,   -128
+
+/* Constants for jsimd_*_ycc_neon() */
+
+.balign 16
+Ljsimd_rgb_ycc_neon_consts:
+  .short 19595, 38470, 7471, 11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128, 32767, 128
+  .short 32767, 128, 32767, 128
+
+/* Constants for jsimd_fdct_islow_neon() */
+
+#define F_0_298   2446  /* FIX(0.298631336) */
+#define F_0_390   3196  /* FIX(0.390180644) */
+#define F_0_541   4433  /* FIX(0.541196100) */
+#define F_0_765   6270  /* FIX(0.765366865) */
+#define F_0_899   7373  /* FIX(0.899976223) */
+#define F_1_175   9633  /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+.balign 16
+Ljsimd_fdct_islow_neon_consts:
+  .short F_0_298
+  .short -F_0_390
+  .short F_0_541
+  .short F_0_765
+  .short - F_0_899
+  .short F_1_175
+  .short F_1_501
+  .short - F_1_847
+  .short - F_1_961
+  .short F_2_053
+  .short - F_2_562
+  .short F_3_072
+  .short 0          /* padding */
+  .short 0
+  .short 0
+  .short 0
+
+#undef F_0_298
+#undef F_0_390
+#undef F_0_541
+#undef F_0_765
+#undef F_0_899
+#undef F_1_175
+#undef F_1_501
+#undef F_1_847
+#undef F_1_961
+#undef F_2_053
+#undef F_2_562
+#undef F_3_072
+
+/* Constants for jsimd_huff_encode_one_block_neon() */
+
+.balign 16
+Ljsimd_huff_encode_one_block_neon_consts:
+    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
+          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
+    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
+            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
+    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
+            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
+    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
+           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
+    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
+            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
+    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
+            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
+    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
+            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
+    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
+            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
+    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
+            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
+    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
+    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
+           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
+    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
+           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
+
+.text
+
+
+/*****************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname
+    .type \fname, %function
+#endif
+\fname:
+#endif
+.endm
+
+/* Get symbol location */
+.macro get_symbol_loc reg, symbol
+#ifdef __APPLE__
+    adrp            \reg, \symbol@PAGE
+    add             \reg, \reg, \symbol@PAGEOFF
+#else
+    adrp            \reg, \symbol
+    add             \reg, \reg, :lo12:\symbol
+#endif
+.endm
+
+.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
+    trn1            \t0\().8h, \l0\().8h, \l1\().8h
+    trn1            \t1\().8h, \l2\().8h, \l3\().8h
+    trn1            \t2\().8h, \l4\().8h, \l5\().8h
+    trn1            \t3\().8h, \l6\().8h, \l7\().8h
+    trn2            \l1\().8h, \l0\().8h, \l1\().8h
+    trn2            \l3\().8h, \l2\().8h, \l3\().8h
+    trn2            \l5\().8h, \l4\().8h, \l5\().8h
+    trn2            \l7\().8h, \l6\().8h, \l7\().8h
+
+    trn1            \l4\().4s, \t2\().4s, \t3\().4s
+    trn2            \t3\().4s, \t2\().4s, \t3\().4s
+    trn1            \t2\().4s, \t0\().4s, \t1\().4s
+    trn2            \l2\().4s, \t0\().4s, \t1\().4s
+    trn1            \t0\().4s, \l1\().4s, \l3\().4s
+    trn2            \l3\().4s, \l1\().4s, \l3\().4s
+    trn2            \t1\().4s, \l5\().4s, \l7\().4s
+    trn1            \l5\().4s, \l5\().4s, \l7\().4s
+
+    trn2            \l6\().2d, \l2\().2d, \t3\().2d
+    trn1            \l0\().2d, \t2\().2d, \l4\().2d
+    trn1            \l1\().2d, \t0\().2d, \l5\().2d
+    trn2            \l7\().2d, \l3\().2d, \t1\().2d
+    trn1            \l2\().2d, \l2\().2d, \t3\().2d
+    trn2            \l4\().2d, \t2\().2d, \l4\().2d
+    trn1            \l3\().2d, \l3\().2d, \t1\().2d
+    trn2            \l5\().2d, \t0\().2d, \l5\().2d
+.endm
+
+
+#define CENTERJSAMPLE  128
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define XFIX_P_0_298  v0.h[0]
+#define XFIX_N_0_390  v0.h[1]
+#define XFIX_P_0_541  v0.h[2]
+#define XFIX_P_0_765  v0.h[3]
+#define XFIX_N_0_899  v0.h[4]
+#define XFIX_P_1_175  v0.h[5]
+#define XFIX_P_1_501  v0.h[6]
+#define XFIX_N_1_847  v0.h[7]
+#define XFIX_N_1_961  v1.h[0]
+#define XFIX_P_2_053  v1.h[1]
+#define XFIX_N_2_562  v1.h[2]
+#define XFIX_P_3_072  v1.h[3]
 
 asm_function jsimd_idct_islow_neon
     DCT_TABLE       .req x0
@@ -216,7 +292,7 @@ asm_function jsimd_idct_islow_neon
     uxtw x3, w3
 
     sub             sp, sp, #64
-    adr             x15, Ljsimd_idct_islow_neon_consts
+    get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
     mov             x10, sp
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
@@ -292,8 +368,8 @@ asm_function jsimd_idct_islow_neon
     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v21.16b, v19.16b               /* tmp3 = z1 */
     mov             v20.16b, v18.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -323,20 +399,20 @@ asm_function jsimd_idct_islow_neon
     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
@@ -380,40 +456,40 @@ asm_function jsimd_idct_islow_neon
     sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 
-    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
-    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
+    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
     movi            v0.16b, #(CENTERJSAMPLE)
-    /* Prepare pointers (dual-issue with NEON instructions) */
+    /* Prepare pointers (dual-issue with Neon instructions) */
       ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn         v28.8b, v2.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       ldp             TMP3, TMP4, [OUTPUT_BUF], 16
-    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn         v29.8b, v3.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       add             TMP1, TMP1, OUTPUT_COL
-    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn         v30.8b, v4.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       add             TMP2, TMP2, OUTPUT_COL
-    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn         v31.8b, v5.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       add             TMP3, TMP3, OUTPUT_COL
-    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       add             TMP4, TMP4, OUTPUT_COL
-    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       ldp             TMP5, TMP6, [OUTPUT_BUF], 16
-    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       ldp             TMP7, TMP8, [OUTPUT_BUF], 16
-    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
+    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS + PASS1_BITS + 3 - 16)
       add             TMP5, TMP5, OUTPUT_COL
     add             v16.16b, v28.16b, v0.16b
       add             TMP6, TMP6, OUTPUT_COL
@@ -474,7 +550,7 @@ asm_function jsimd_idct_islow_neon
     sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v20.16b, v18.16b               /* tmp3 = z1 */
     sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
     sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
@@ -496,10 +572,10 @@ asm_function jsimd_idct_islow_neon
     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
     add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
@@ -525,14 +601,14 @@ asm_function jsimd_idct_islow_neon
     add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
     sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */
 
-    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
     mov             v6.16b, v15.16b
     mov             v7.16b, v15.16b
     mov             v8.16b, v15.16b
@@ -551,7 +627,7 @@ asm_function jsimd_idct_islow_neon
     sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v21.16b, v19.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
@@ -574,10 +650,10 @@ asm_function jsimd_idct_islow_neon
     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
@@ -609,14 +685,14 @@ asm_function jsimd_idct_islow_neon
     mov             v3.16b, v14.16b
     mov             v4.16b, v14.16b
     mov             v5.16b, v14.16b
-    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
     b               1b
 
 .balign 16
@@ -631,8 +707,8 @@ asm_function jsimd_idct_islow_neon
     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
     mov             v21.16b, v19.16b               /* tmp3 = z1 */
     mov             v20.16b, v18.16b               /* tmp3 = z1 */
-    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
-    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
+    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
+    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
@@ -662,20 +738,20 @@ asm_function jsimd_idct_islow_neon
     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
-    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
-    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
-    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
-    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
+    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
+    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
+    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
+    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
 
     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
@@ -719,22 +795,22 @@ asm_function jsimd_idct_islow_neon
     sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
 
-    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
-    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
-    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
-    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
-    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v2.4h, v18.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v3.4h, v22.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v4.4h, v26.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v5.4h, v14.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn           v6.4h, v19.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn           v7.4h, v23.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn           v8.4h, v27.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn           v9.4h, v15.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v2.8h, v16.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v3.8h, v28.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v4.8h, v24.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v5.8h, v20.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
+    rshrn2          v6.8h, v17.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
+    rshrn2          v7.8h, v29.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
+    rshrn2          v8.8h, v25.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
+    rshrn2          v9.8h, v21.4s, #(CONST_BITS - PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
     b               1b
 
     .unreq          DCT_TABLE
@@ -767,665 +843,6 @@ asm_function jsimd_idct_islow_neon
 #undef XFIX_P_3_072
 
 
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
- * function from jidctfst.c
- *
- * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in ARM NEON case some extra additions are required because VQDMULH
- * instruction can't handle the constants larger than 1. So the expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
- */
-
-#define XFIX_1_082392200 v0.h[0]
-#define XFIX_1_414213562 v0.h[1]
-#define XFIX_1_847759065 v0.h[2]
-#define XFIX_2_613125930 v0.h[3]
-
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x9
-    TMP4            .req x10
-    TMP5            .req x11
-    TMP6            .req x12
-    TMP7            .req x13
-    TMP8            .req x14
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* Load and dequantize coefficients into NEON registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( v16.8h )
-     *   1 | d18     | d19     ( v17.8h )
-     *   2 | d20     | d21     ( v18.8h )
-     *   3 | d22     | d23     ( v19.8h )
-     *   4 | d24     | d25     ( v20.8h )
-     *   5 | d26     | d27     ( v21.8h )
-     *   6 | d28     | d29     ( v22.8h )
-     *   7 | d30     | d31     ( v23.8h )
-     */
-    /* Save NEON registers used in fast IDCT */
-    adr             TMP5, Ljsimd_idct_ifast_neon_consts
-    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
-    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
-    mul             v16.8h, v16.8h, v0.8h
-    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v17.8h, v17.8h, v1.8h
-    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
-    mul             v18.8h, v18.8h, v2.8h
-    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    mul             v19.8h, v19.8h, v3.8h
-    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
-    mul             v20.8h, v20.8h, v0.8h
-    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v22.8h, v22.8h, v2.8h
-    mul             v21.8h, v21.8h, v1.8h
-    ld1             {v0.4h}, [TMP5]        /* load constants */
-    mul             v23.8h, v23.8h, v3.8h
-
-    /* 1-D IDCT, pass 1 */
-    sub             v2.8h, v18.8h, v22.8h
-    add             v22.8h, v18.8h, v22.8h
-    sub             v1.8h, v19.8h, v21.8h
-    add             v21.8h, v19.8h, v21.8h
-    sub             v5.8h, v17.8h, v23.8h
-    add             v23.8h, v17.8h, v23.8h
-    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
-    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
-    add             v3.8h, v1.8h, v1.8h
-    sub             v1.8h, v5.8h, v1.8h
-    add             v18.8h, v2.8h, v4.8h
-    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
-    sub             v2.8h, v23.8h, v21.8h
-    add             v3.8h, v3.8h, v6.8h
-    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
-    add             v1.8h, v1.8h, v4.8h
-    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
-    sub             v18.8h, v18.8h, v22.8h
-    add             v2.8h, v2.8h, v6.8h
-    sub             v6.8h, v16.8h, v20.8h
-    add             v20.8h, v16.8h, v20.8h
-    add             v17.8h, v5.8h, v4.8h
-    add             v5.8h, v6.8h, v18.8h
-    sub             v18.8h, v6.8h, v18.8h
-    add             v6.8h, v23.8h, v21.8h
-    add             v16.8h, v20.8h, v22.8h
-    sub             v3.8h, v6.8h, v3.8h
-    sub             v20.8h, v20.8h, v22.8h
-    sub             v3.8h, v3.8h, v1.8h
-    sub             v1.8h, v17.8h, v1.8h
-    add             v2.8h, v3.8h, v2.8h
-    sub             v23.8h, v16.8h, v6.8h
-    add             v1.8h, v1.8h, v2.8h
-    add             v16.8h, v16.8h, v6.8h
-    add             v22.8h, v5.8h, v3.8h
-    sub             v17.8h, v5.8h, v3.8h
-    sub             v21.8h, v18.8h, v2.8h
-    add             v18.8h, v18.8h, v2.8h
-    sub             v19.8h, v20.8h, v1.8h
-    add             v20.8h, v20.8h, v1.8h
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
-    /* 1-D IDCT, pass 2 */
-    sub             v2.8h, v18.8h, v22.8h
-    add             v22.8h, v18.8h, v22.8h
-    sub             v1.8h, v19.8h, v21.8h
-    add             v21.8h, v19.8h, v21.8h
-    sub             v5.8h, v17.8h, v23.8h
-    add             v23.8h, v17.8h, v23.8h
-    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
-    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
-    add             v3.8h, v1.8h, v1.8h
-    sub             v1.8h, v5.8h, v1.8h
-    add             v18.8h, v2.8h, v4.8h
-    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
-    sub             v2.8h, v23.8h, v21.8h
-    add             v3.8h, v3.8h, v6.8h
-    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
-    add             v1.8h, v1.8h, v4.8h
-    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
-    sub             v18.8h, v18.8h, v22.8h
-    add             v2.8h, v2.8h, v6.8h
-    sub             v6.8h, v16.8h, v20.8h
-    add             v20.8h, v16.8h, v20.8h
-    add             v17.8h, v5.8h, v4.8h
-    add             v5.8h, v6.8h, v18.8h
-    sub             v18.8h, v6.8h, v18.8h
-    add             v6.8h, v23.8h, v21.8h
-    add             v16.8h, v20.8h, v22.8h
-    sub             v3.8h, v6.8h, v3.8h
-    sub             v20.8h, v20.8h, v22.8h
-    sub             v3.8h, v3.8h, v1.8h
-    sub             v1.8h, v17.8h, v1.8h
-    add             v2.8h, v3.8h, v2.8h
-    sub             v23.8h, v16.8h, v6.8h
-    add             v1.8h, v1.8h, v2.8h
-    add             v16.8h, v16.8h, v6.8h
-    add             v22.8h, v5.8h, v3.8h
-    sub             v17.8h, v5.8h, v3.8h
-    sub             v21.8h, v18.8h, v2.8h
-    add             v18.8h, v18.8h, v2.8h
-    sub             v19.8h, v20.8h, v1.8h
-    add             v20.8h, v20.8h, v1.8h
-    /* Descale to 8-bit and range limit */
-    movi            v0.16b, #0x80
-      /* Prepare pointers (dual-issue with NEON instructions) */
-      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    sqshrn          v28.8b, v16.8h, #5
-      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
-    sqshrn          v29.8b, v17.8h, #5
-      add             TMP1, TMP1, OUTPUT_COL
-    sqshrn          v30.8b, v18.8h, #5
-      add             TMP2, TMP2, OUTPUT_COL
-    sqshrn          v31.8b, v19.8h, #5
-      add             TMP3, TMP3, OUTPUT_COL
-    sqshrn2         v28.16b, v20.8h, #5
-      add             TMP4, TMP4, OUTPUT_COL
-    sqshrn2         v29.16b, v21.8h, #5
-      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
-    sqshrn2         v30.16b, v22.8h, #5
-      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
-    sqshrn2         v31.16b, v23.8h, #5
-      add             TMP5, TMP5, OUTPUT_COL
-    add             v16.16b, v28.16b, v0.16b
-      add             TMP6, TMP6, OUTPUT_COL
-    add             v18.16b, v29.16b, v0.16b
-      add             TMP7, TMP7, OUTPUT_COL
-    add             v20.16b, v30.16b, v0.16b
-      add             TMP8, TMP8, OUTPUT_COL
-    add             v22.16b, v31.16b, v0.16b
-
-    /* Transpose the final 8-bit samples */
-    trn1            v28.16b, v16.16b, v18.16b
-    trn1            v30.16b, v20.16b, v22.16b
-    trn2            v29.16b, v16.16b, v18.16b
-    trn2            v31.16b, v20.16b, v22.16b
-
-    trn1            v16.8h, v28.8h, v30.8h
-    trn2            v18.8h, v28.8h, v30.8h
-    trn1            v20.8h, v29.8h, v31.8h
-    trn2            v22.8h, v29.8h, v31.8h
-
-    uzp1            v28.4s, v16.4s, v18.4s
-    uzp2            v30.4s, v16.4s, v18.4s
-    uzp1            v29.4s, v20.4s, v22.4s
-    uzp2            v31.4s, v20.4s, v22.4s
-
-    /* Store results to the output buffer */
-    st1             {v28.d}[0], [TMP1]
-    st1             {v29.d}[0], [TMP2]
-    st1             {v28.d}[1], [TMP3]
-    st1             {v29.d}[1], [TMP4]
-    st1             {v30.d}[0], [TMP5]
-    st1             {v31.d}[0], [TMP6]
-    st1             {v30.d}[1], [TMP7]
-    st1             {v31.d}[1], [TMP8]
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular NEON optimized function is
- *       bit exact compatibility with jpeg-6b.
- *
- * TODO: a bit better instructions scheduling can be achieved by expanding
- *       idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
-
-#define CONST_BITS  13
-
-#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
-
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065      /* v0.h[0] */
-  .short -FIX_0_765366865     /* v0.h[1] */
-  .short -FIX_0_211164243     /* v0.h[2] */
-  .short FIX_1_451774981      /* v0.h[3] */
-  .short -FIX_2_172734803     /* d1[0] */
-  .short FIX_1_061594337      /* d1[1] */
-  .short -FIX_0_509795579     /* d1[2] */
-  .short -FIX_0_601344887     /* d1[3] */
-  .short FIX_0_899976223      /* v2.h[0] */
-  .short FIX_2_562915447      /* v2.h[1] */
-  .short 1 << (CONST_BITS+1)  /* v2.h[2] */
-  .short 0                    /* v2.h[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4, v2.h[2]
-    smlal           v28.4s, \x8, v0.h[0]
-    smlal           v28.4s, \x14, v0.h[1]
-
-    smull           v26.4s, \x16, v1.h[2]
-    smlal           v26.4s, \x12, v1.h[3]
-    smlal           v26.4s, \x10, v2.h[0]
-    smlal           v26.4s, \x6, v2.h[1]
-
-    smull           v30.4s, \x4, v2.h[2]
-    smlsl           v30.4s, \x8, v0.h[0]
-    smlsl           v30.4s, \x14, v0.h[1]
-
-    smull           v24.4s, \x16, v0.h[2]
-    smlal           v24.4s, \x12, v0.h[3]
-    smlal           v24.4s, \x10, v1.h[0]
-    smlal           v24.4s, \x6, v1.h[1]
-
-    add             v20.4s, v28.4s, v26.4s
-    sub             v28.4s, v28.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v28.4s, v28.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y29, v28.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y29, v28.4s, #\shift
-  .endif
-
-    add             v20.4s, v30.4s, v24.4s
-    sub             v30.4s, v30.4s, v24.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v30.4s, v30.4s, #\shift
-    xtn             \y27, v20.4s
-    xtn             \y28, v30.4s
-  .else
-    rshrn           \y27, v20.4s, #\shift
-    rshrn           \y28, v30.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* Save all used NEON registers */
-    sub             sp, sp, 64
-    mov             x9, sp
-    /* Load constants (v3.4h is just used for padding) */
-    adr             TMP4, Ljsimd_idct_4x4_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
-
-    /* Load all COEF_BLOCK into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | v8.4h   | v9.4h
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | v14.4h  | v15.4h
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
-    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
-    mul             v8.4h, v8.4h, v22.4h
-    mul             v9.4h, v9.4h, v23.4h
-    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v14.4h, v14.4h, v28.4h
-    mul             v15.4h, v15.4h, v29.4h
-    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
-
-    /* Pass 1 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
-                    v4.4h, v6.4h, v8.4h, v10.4h
-    transpose_4x4   v4, v6, v8, v10, v3
-    ins             v10.d[1], v11.d[0]
-    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
-                    v5.4h, v7.4h, v9.4h, v11.4h
-    transpose_4x4   v5, v7, v9, v11, v3
-    ins             v10.d[1], v11.d[0]
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
-                    v26.4h, v27.4h, v28.4h, v29.4h
-    transpose_4x4   v26, v27, v28, v29, v3
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    ins             v28.d[1], v29.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    add             v28.8h, v28.8h, v30.8h
-    sqxtun          v26.8b, v26.8h
-    sqxtun          v27.8b, v28.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    ldp             TMP3, TMP4, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use much less instructions on little endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses
-     */
-    st1             {v26.s}[0], [TMP1], 4
-    st1             {v27.s}[0], [TMP3], 4
-    st1             {v26.s}[1], [TMP2], 4
-    st1             {v27.s}[1], [TMP4], 4
-#else
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[0], [TMP3], 1
-    st1             {v26.b}[1], [TMP1], 1
-    st1             {v27.b}[1], [TMP3], 1
-    st1             {v26.b}[2], [TMP1], 1
-    st1             {v27.b}[2], [TMP3], 1
-    st1             {v26.b}[3], [TMP1], 1
-    st1             {v27.b}[3], [TMP3], 1
-
-    st1             {v26.b}[4], [TMP2], 1
-    st1             {v27.b}[4], [TMP4], 1
-    st1             {v26.b}[5], [TMP2], 1
-    st1             {v27.b}[5], [TMP4], 1
-    st1             {v26.b}[6], [TMP2], 1
-    st1             {v27.b}[6], [TMP4], 1
-    st1             {v26.b}[7], [TMP2], 1
-    st1             {v27.b}[7], [TMP4], 1
-#endif
-
-    /* vpop            {v8.4h - v15.4h}    ;not available */
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular NEON optimized function is
- *       bit exact compatibility with jpeg-6b.
- */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* v14[0] */
-  .short FIX_0_850430095   /* v14[1] */
-  .short -FIX_1_272758580  /* v14[2] */
-  .short FIX_3_624509785   /* v14[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll           v15.4s, \x4, #15
-    smull           v26.4s, \x6, v14.h[3]
-    smlal           v26.4s, \x10, v14.h[2]
-    smlal           v26.4s, \x12, v14.h[1]
-    smlal           v26.4s, \x16, v14.h[0]
-
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v15.4s, v15.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y27, v15.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y27, v15.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* vpush           {v8.4h - v15.4h}            ; not available */
-    sub             sp, sp, 64
-    mov             x9, sp
-
-    /* Load constants */
-    adr             TMP2, Ljsimd_idct_2x2_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v14.4h}, [TMP2]
-
-    /* Load all COEF_BLOCK into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | -       | -
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | -       | -
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* Dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]
-
-    /* Pass 1 */
-#if 0
-    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
-    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
-    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
-    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
-#else
-    smull           v26.4s, v6.4h, v14.h[3]
-    smlal           v26.4s, v10.4h, v14.h[2]
-    smlal           v26.4s, v12.4h, v14.h[1]
-    smlal           v26.4s, v16.4h, v14.h[0]
-    smull           v24.4s, v7.4h, v14.h[3]
-    smlal           v24.4s, v11.4h, v14.h[2]
-    smlal           v24.4s, v13.4h, v14.h[1]
-    smlal           v24.4s, v17.4h, v14.h[0]
-    sshll           v15.4s, v4.4h, #15
-    sshll           v30.4s, v5.4h, #15
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-    rshrn           v4.4h, v20.4s, #13
-    rshrn           v6.4h, v15.4s, #13
-    add             v20.4s, v30.4s, v24.4s
-    sub             v15.4s, v30.4s, v24.4s
-    rshrn           v5.4h, v20.4s, #13
-    rshrn           v7.4h, v15.4s, #13
-    ins             v4.d[1], v5.d[0]
-    ins             v6.d[1], v7.d[0]
-    transpose       v4, v6, v3, .16b, .8h
-    transpose       v6, v10, v3, .16b, .4s
-    ins             v11.d[0], v10.d[1]
-    ins             v7.d[0], v6.d[1]
-#endif
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    sqxtun          v30.8b, v26.8h
-    ins             v26.d[0], v30.d[0]
-    sqxtun          v27.8b, v26.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[4], [TMP1], 1
-    st1             {v26.b}[1], [TMP2], 1
-    st1             {v27.b}[5], [TMP2], 1
-
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-
-.purgem idct_helper
-
-
 /*****************************************************************************/
 
 /*
@@ -1543,7 +960,7 @@ asm_function jsimd_idct_2x2_neon
     .else
       .error unsupported macroblock size
     .endif
-  .elseif \bpp==16
+  .elseif \bpp == 16
     .if \size == 8
       st1           {v25.8h}, [RGB], 16
     .elseif \size == 4
@@ -1662,21 +1079,6 @@ asm_function jsimd_idct_2x2_neon
     do_yuv_to_rgb_stage2
 .endm
 
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-.if \fast_st3 == 1
-Ljsimd_ycc_\colorid\()_neon_consts:
-.else
-Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
-.endif
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
 .if \fast_st3 == 1
 asm_function jsimd_ycc_\colorid\()_convert_neon
 .else
@@ -1702,13 +1104,9 @@ asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
     mov             x9, sp
 
     /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
-    .if \fast_st3 == 1
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
-    .else
-      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
-    .endif
+    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts
 
-    /* Save NEON registers */
+    /* Save Neon registers */
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
     ld1             {v0.4h, v1.4h}, [x15], 16
@@ -1993,7 +1391,7 @@ generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,
 .endm
 
 /* TODO: expand macros and interleave instructions if some in-order
- *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
+ *       AArch64 processor actually can dual-issue LOAD/STORE with ALU */
 .macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
     do_rgb_to_yuv_stage2
     do_load         \bpp, 8, \fast_ld3
@@ -2003,17 +1401,6 @@ generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,
     do_rgb_to_yuv_stage1
 .endm
 
-.balign 16
-.if \fast_ld3 == 1
-Ljsimd_\colorid\()_ycc_neon_consts:
-.else
-Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
-.endif
-  .short 19595, 38470, 7471, 11059
-  .short 21709, 32768, 27439, 5329
-  .short 32767, 128, 32767, 128
-  .short 32767, 128, 32767, 128
-
 .if \fast_ld3 == 1
 asm_function jsimd_\colorid\()_ycc_convert_neon
 .else
@@ -2036,11 +1423,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     N               .req w12
 
     /* Load constants to d0, d1, d2, d3 */
-    .if \fast_ld3 == 1
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
-    .else
-      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
-    .endif
+    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
     ld1             {v0.8h, v1.8h}, [x13]
 
     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
@@ -2048,7 +1431,7 @@ asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
     ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
     .unreq          OUTPUT_BUF
 
-    /* Save NEON registers */
+    /* Save Neon registers */
     sub             sp, sp, #64
     mov             x9, sp
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
@@ -2144,88 +1527,12 @@ generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
 .purgem do_store
 
 
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- *       rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req x0
-    START_COL       .req x1
-    WORKSPACE       .req x2
-    TMP1            .req x9
-    TMP2            .req x10
-    TMP3            .req x11
-    TMP4            .req x12
-    TMP5            .req x13
-    TMP6            .req x14
-    TMP7            .req x15
-    TMP8            .req x4
-    TMPDUP          .req w3
-
-    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x1 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x1, w1
-
-    mov             TMPDUP, #128
-    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
-    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
-    dup             v0.8b, TMPDUP
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
-    add             TMP5, TMP5, START_COL
-    add             TMP6, TMP6, START_COL
-    ld1             {v16.8b}, [TMP1]
-    add             TMP7, TMP7, START_COL
-    add             TMP8, TMP8, START_COL
-    ld1             {v17.8b}, [TMP2]
-    usubl           v16.8h, v16.8b, v0.8b
-    ld1             {v18.8b}, [TMP3]
-    usubl           v17.8h, v17.8b, v0.8b
-    ld1             {v19.8b}, [TMP4]
-    usubl           v18.8h, v18.8b, v0.8b
-    ld1             {v20.8b}, [TMP5]
-    usubl           v19.8h, v19.8b, v0.8b
-    ld1             {v21.8b}, [TMP6]
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
-    usubl           v20.8h, v20.8b, v0.8b
-    ld1             {v22.8b}, [TMP7]
-    usubl           v21.8h, v21.8b, v0.8b
-    ld1             {v23.8b}, [TMP8]
-    usubl           v22.8h, v22.8b, v0.8b
-    usubl           v23.8h, v23.8b, v0.8b
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
-    br              x30
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-    .unreq          TMPDUP
-
 /*****************************************************************************/
 
 /*
  * jsimd_fdct_islow_neon
  *
- * This file contains a slow-but-accurate integer implementation of the
+ * This file contains a slower but more accurate integer implementation of the
  * forward DCT (Discrete Cosine Transform). The following code is based
  * directly on the IJG''s original jfdctint.c; see the jfdctint.c for
  * more details.
@@ -2234,68 +1541,24 @@ asm_function jsimd_convsamp_neon
  *       rid of a bunch of VLD1.16 instructions
  */
 
-#define CONST_BITS 13
-#define PASS1_BITS 2
+#define CONST_BITS  13
+#define PASS1_BITS  2
 
-#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS+PASS1_BITS)
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
 
-#define F_0_298  2446  /* FIX(0.298631336) */
-#define F_0_390  3196  /* FIX(0.390180644) */
-#define F_0_541  4433  /* FIX(0.541196100) */
-#define F_0_765  6270  /* FIX(0.765366865) */
-#define F_0_899  7373  /* FIX(0.899976223) */
-#define F_1_175  9633  /* FIX(1.175875602) */
-#define F_1_501 12299  /* FIX(1.501321110) */
-#define F_1_847 15137  /* FIX(1.847759065) */
-#define F_1_961 16069  /* FIX(1.961570560) */
-#define F_2_053 16819  /* FIX(2.053119869) */
-#define F_2_562 20995  /* FIX(2.562915447) */
-#define F_3_072 25172  /* FIX(3.072711026) */
-
-.balign 16
-Ljsimd_fdct_islow_neon_consts:
-  .short F_0_298
-  .short -F_0_390
-  .short F_0_541
-  .short F_0_765
-  .short - F_0_899
-  .short F_1_175
-  .short F_1_501
-  .short - F_1_847
-  .short - F_1_961
-  .short F_2_053
-  .short - F_2_562
-  .short F_3_072
-  .short 0          /* padding */
-  .short 0
-  .short 0
-  .short 0
-
-#undef F_0_298
-#undef F_0_390
-#undef F_0_541
-#undef F_0_765
-#undef F_0_899
-#undef F_1_175
-#undef F_1_501
-#undef F_1_847
-#undef F_1_961
-#undef F_2_053
-#undef F_2_562
-#undef F_3_072
-#define XFIX_P_0_298 v0.h[0]
-#define XFIX_N_0_390 v0.h[1]
-#define XFIX_P_0_541 v0.h[2]
-#define XFIX_P_0_765 v0.h[3]
-#define XFIX_N_0_899 v0.h[4]
-#define XFIX_P_1_175 v0.h[5]
-#define XFIX_P_1_501 v0.h[6]
-#define XFIX_N_1_847 v0.h[7]
-#define XFIX_N_1_961 v1.h[0]
-#define XFIX_P_2_053 v1.h[1]
-#define XFIX_N_2_562 v1.h[2]
-#define XFIX_P_3_072 v1.h[3]
+#define XFIX_P_0_298  v0.h[0]
+#define XFIX_N_0_390  v0.h[1]
+#define XFIX_P_0_541  v0.h[2]
+#define XFIX_P_0_765  v0.h[3]
+#define XFIX_N_0_899  v0.h[4]
+#define XFIX_P_1_175  v0.h[5]
+#define XFIX_P_1_501  v0.h[6]
+#define XFIX_N_1_847  v0.h[7]
+#define XFIX_N_1_961  v1.h[0]
+#define XFIX_P_2_053  v1.h[1]
+#define XFIX_N_2_562  v1.h[2]
+#define XFIX_P_3_072  v1.h[3]
 
 asm_function jsimd_fdct_islow_neon
 
@@ -2303,16 +1566,16 @@ asm_function jsimd_fdct_islow_neon
     TMP             .req x9
 
     /* Load constants */
-    adr             TMP, Ljsimd_fdct_islow_neon_consts
+    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
     ld1             {v0.8h, v1.8h}, [TMP]
 
-    /* Save NEON registers */
+    /* Save Neon registers */
     sub             sp, sp, #64
     mov             x10, sp
     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
 
-    /* Load all DATA into NEON registers with the following allocation:
+    /* Load all DATA into Neon registers with the following allocation:
      *       0 1 2 3 | 4 5 6 7
      *      ---------+--------
      *   0 | d16     | d17    | v16.8h
@@ -2353,8 +1616,8 @@ asm_function jsimd_fdct_islow_neon
 
     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
 
-    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
-    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
+    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
+    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
 
     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2368,8 +1631,8 @@ asm_function jsimd_fdct_islow_neon
 
     rshrn           v18.4h, v18.4s, #DESCALE_P1
     rshrn           v22.4h, v22.4s, #DESCALE_P1
-    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
-    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
 
     /* Odd part */
 
@@ -2395,10 +1658,10 @@ asm_function jsimd_fdct_islow_neon
     smull2          v13.4s, v9.8h, XFIX_N_2_562
     smull2          v14.4s, v10.8h, XFIX_N_1_961
     smull2          v15.4s, v11.8h, XFIX_N_0_390
-    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
-    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
-    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
-    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
 
     add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
     add             v14.4s, v14.4s, v5.4s
@@ -2427,10 +1690,10 @@ asm_function jsimd_fdct_islow_neon
     rshrn           v21.4h, v29.4s, #DESCALE_P1
     rshrn           v19.4h, v30.4s, #DESCALE_P1
     rshrn           v17.4h, v31.4s, #DESCALE_P1
-    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
 
     /* Transpose */
     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
@@ -2456,8 +1719,8 @@ asm_function jsimd_fdct_islow_neon
 
     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
 
-    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
-    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
+    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
+    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
 
     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
@@ -2471,8 +1734,8 @@ asm_function jsimd_fdct_islow_neon
 
     rshrn           v18.4h, v18.4s, #DESCALE_P2
     rshrn           v22.4h, v22.4s, #DESCALE_P2
-    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
-    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
+    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
+    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
 
     /* Odd part */
     add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
@@ -2498,10 +1761,10 @@ asm_function jsimd_fdct_islow_neon
     smull2          v13.4s, v9.8h, XFIX_N_2_562
     smull2          v14.4s, v10.8h, XFIX_N_1_961
     smull2          v15.4s, v11.8h, XFIX_N_0_390
-    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
-    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
-    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
-    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
+    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
+    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
+    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
+    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
 
     add             v10.4s, v10.4s, v4.4s
     add             v14.4s, v14.4s, v5.4s
@@ -2530,16 +1793,16 @@ asm_function jsimd_fdct_islow_neon
     rshrn           v21.4h, v29.4s, #DESCALE_P2
     rshrn           v19.4h, v30.4s, #DESCALE_P2
     rshrn           v17.4h, v31.4s, #DESCALE_P2
-    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
-    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
-    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
+    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
+    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
 
     /* store results */
     st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
 
-    /* Restore NEON registers */
+    /* Restore Neon registers */
     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
 
@@ -2565,405 +1828,10 @@ asm_function jsimd_fdct_islow_neon
 /*****************************************************************************/
 
 /*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- *       rid of a bunch of VLD1.16 instructions
- */
-
-#undef XFIX_0_541196100
-#define XFIX_0_382683433 v0.h[0]
-#define XFIX_0_541196100 v0.h[1]
-#define XFIX_0_707106781 v0.h[2]
-#define XFIX_1_306562965 v0.h[3]
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req x0
-    TMP             .req x9
-
-    /* Load constants */
-    adr             TMP, Ljsimd_fdct_ifast_neon_consts
-    ld1             {v0.4h}, [TMP]
-
-    /* Load all DATA into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17    | v0.8h
-     *   1 | d18     | d19    | q9
-     *   2 | d20     | d21    | q10
-     *   3 | d22     | d23    | q11
-     *   4 | d24     | d25    | q12
-     *   5 | d26     | d27    | q13
-     *   6 | d28     | d29    | q14
-     *   7 | d30     | d31    | q15
-     */
-
-    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-    mov             TMP, #2
-    sub             DATA, DATA, #64
-1:
-    /* Transpose */
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
-    subs            TMP, TMP, #1
-    /* 1-D FDCT */
-    add             v4.8h, v19.8h, v20.8h
-    sub             v20.8h, v19.8h, v20.8h
-    sub             v28.8h, v18.8h, v21.8h
-    add             v18.8h, v18.8h, v21.8h
-    sub             v29.8h, v17.8h, v22.8h
-    add             v17.8h, v17.8h, v22.8h
-    sub             v21.8h, v16.8h, v23.8h
-    add             v16.8h, v16.8h, v23.8h
-    sub             v6.8h, v17.8h, v18.8h
-    sub             v7.8h, v16.8h, v4.8h
-    add             v5.8h, v17.8h, v18.8h
-    add             v6.8h, v6.8h, v7.8h
-    add             v4.8h, v16.8h, v4.8h
-    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
-    add             v19.8h, v20.8h, v28.8h
-    add             v16.8h, v4.8h, v5.8h
-    sub             v20.8h, v4.8h, v5.8h
-    add             v5.8h, v28.8h, v29.8h
-    add             v29.8h, v29.8h, v21.8h
-    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
-    sub             v28.8h, v19.8h, v29.8h
-    add             v18.8h, v7.8h, v6.8h
-    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
-    sub             v22.8h, v7.8h, v6.8h
-    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
-    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
-    add             v6.8h, v21.8h, v5.8h
-    sub             v5.8h, v21.8h, v5.8h
-    add             v29.8h, v29.8h, v28.8h
-    add             v19.8h, v19.8h, v28.8h
-    add             v29.8h, v29.8h, v7.8h
-    add             v21.8h, v5.8h, v19.8h
-    sub             v19.8h, v5.8h, v19.8h
-    add             v17.8h, v6.8h, v29.8h
-    sub             v23.8h, v6.8h, v29.8h
-
-    b.ne            1b
-
-    /* store results */
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
-    br              x30
-
-    .unreq          DATA
-    .unreq          TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- *                      DCTELEM *workspace);
- *
- */
-asm_function jsimd_quantize_neon
-
-    COEF_BLOCK      .req x0
-    DIVISORS        .req x1
-    WORKSPACE       .req x2
-
-    RECIPROCAL      .req DIVISORS
-    CORRECTION      .req x9
-    SHIFT           .req x10
-    LOOP_COUNT      .req x11
-
-    mov             LOOP_COUNT, #2
-    add             CORRECTION, DIVISORS, #(64 * 2)
-    add             SHIFT, DIVISORS, #(64 * 6)
-1:
-    subs            LOOP_COUNT, LOOP_COUNT, #1
-    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
-    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
-    abs             v20.8h, v0.8h
-    abs             v21.8h, v1.8h
-    abs             v22.8h, v2.8h
-    abs             v23.8h, v3.8h
-    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
-    add             v20.8h, v20.8h, v4.8h  /* add correction */
-    add             v21.8h, v21.8h, v5.8h
-    add             v22.8h, v22.8h, v6.8h
-    add             v23.8h, v23.8h, v7.8h
-    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
-    umull2          v16.4s, v20.8h, v28.8h
-    umull           v5.4s, v21.4h, v29.4h
-    umull2          v17.4s, v21.8h, v29.8h
-    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
-    umull2          v18.4s, v22.8h, v30.8h
-    umull           v7.4s, v23.4h, v31.4h
-    umull2          v19.4s, v23.8h, v31.8h
-    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
-    shrn            v4.4h, v4.4s, #16
-    shrn            v5.4h, v5.4s, #16
-    shrn            v6.4h, v6.4s, #16
-    shrn            v7.4h, v7.4s, #16
-    shrn2           v4.8h, v16.4s, #16
-    shrn2           v5.8h, v17.4s, #16
-    shrn2           v6.8h, v18.4s, #16
-    shrn2           v7.8h, v19.4s, #16
-    neg             v24.8h, v24.8h
-    neg             v25.8h, v25.8h
-    neg             v26.8h, v26.8h
-    neg             v27.8h, v27.8h
-    sshr            v0.8h, v0.8h, #15  /* extract sign */
-    sshr            v1.8h, v1.8h, #15
-    sshr            v2.8h, v2.8h, #15
-    sshr            v3.8h, v3.8h, #15
-    ushl            v4.8h, v4.8h, v24.8h  /* shift */
-    ushl            v5.8h, v5.8h, v25.8h
-    ushl            v6.8h, v6.8h, v26.8h
-    ushl            v7.8h, v7.8h, v27.8h
-
-    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
-    eor             v5.16b, v5.16b, v1.16b
-    eor             v6.16b, v6.16b, v2.16b
-    eor             v7.16b, v7.16b, v3.16b
-    sub             v4.8h, v4.8h, v0.8h
-    sub             v5.8h, v5.8h, v1.8h
-    sub             v6.8h, v6.8h, v2.8h
-    sub             v7.8h, v7.8h, v3.8h
-    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
-
-    b.ne            1b
-
-    br              x30  /* return */
-
-    .unreq          COEF_BLOCK
-    .unreq          DIVISORS
-    .unreq          WORKSPACE
-    .unreq          RECIPROCAL
-    .unreq          CORRECTION
-    .unreq          SHIFT
-    .unreq          LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 1:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- *                             JDIMENSION v_samp_factor,
- *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
- *                             JSAMPARRAY output_data);
- */
-
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
-        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
-        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
-        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
-  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
-        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
-  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
-        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
-
-asm_function jsimd_h2v1_downsample_neon
-    IMAGE_WIDTH     .req x0
-    MAX_V_SAMP      .req x1
-    V_SAMP          .req x2
-    BLOCK_WIDTH     .req x3
-    INPUT_DATA      .req x4
-    OUTPUT_DATA     .req x5
-    OUTPTR          .req x9
-    INPTR           .req x10
-    TMP1            .req x11
-    TMP2            .req x12
-    TMP3            .req x13
-    TMPDUP          .req w15
-
-    mov             TMPDUP, #0x10000
-    lsl             TMP2, BLOCK_WIDTH, #4
-    sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
-    add             TMP3, TMP3, TMP2, lsl #4
-    dup             v16.4s, TMPDUP
-    ld1             {v18.16b}, [TMP3]
-
-1:  /* row loop */
-    ldr             INPTR, [INPUT_DATA], #8
-    ldr             OUTPTR, [OUTPUT_DATA], #8
-    subs            TMP1, BLOCK_WIDTH, #1
-    b.eq            3f
-2:  /* columns */
-    ld1             {v0.16b}, [INPTR], #16
-    mov             v4.16b, v16.16b
-    subs            TMP1, TMP1, #1
-    uadalp          v4.8h, v0.16b
-    shrn            v6.8b, v4.8h, #1
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            2b
-3:  /* last columns */
-    ld1             {v0.16b}, [INPTR]
-    mov             v4.16b, v16.16b
-    subs            V_SAMP, V_SAMP, #1
-    /* expand right */
-    tbl             v2.16b, {v0.16b}, v18.16b
-    uadalp          v4.8h, v2.16b
-    shrn            v6.8b, v4.8h, #1
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            1b
-
-    br              x30
-
-    .unreq          IMAGE_WIDTH
-    .unreq          MAX_V_SAMP
-    .unreq          V_SAMP
-    .unreq          BLOCK_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA
-    .unreq          OUTPTR
-    .unreq          INPTR
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 2:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
- *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
- *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
-.balign 16
-asm_function jsimd_h2v2_downsample_neon
-    IMAGE_WIDTH     .req x0
-    MAX_V_SAMP      .req x1
-    V_SAMP          .req x2
-    BLOCK_WIDTH     .req x3
-    INPUT_DATA      .req x4
-    OUTPUT_DATA     .req x5
-    OUTPTR          .req x9
-    INPTR0          .req x10
-    INPTR1          .req x14
-    TMP1            .req x11
-    TMP2            .req x12
-    TMP3            .req x13
-    TMPDUP          .req w15
-
-    mov             TMPDUP, #1
-    lsl             TMP2, BLOCK_WIDTH, #4
-    lsl             TMPDUP, TMPDUP, #17
-    sub             TMP2, TMP2, IMAGE_WIDTH
-    adr             TMP3, Ljsimd_h2_downsample_neon_consts
-    orr             TMPDUP, TMPDUP, #1
-    add             TMP3, TMP3, TMP2, lsl #4
-    dup             v16.4s, TMPDUP
-    ld1             {v18.16b}, [TMP3]
-
-1:  /* row loop */
-    ldr             INPTR0, [INPUT_DATA], #8
-    ldr             OUTPTR, [OUTPUT_DATA], #8
-    ldr             INPTR1, [INPUT_DATA], #8
-    subs            TMP1, BLOCK_WIDTH, #1
-    b.eq            3f
-2:  /* columns */
-    ld1             {v0.16b}, [INPTR0], #16
-    ld1             {v1.16b}, [INPTR1], #16
-    mov             v4.16b, v16.16b
-    subs            TMP1, TMP1, #1
-    uadalp          v4.8h, v0.16b
-    uadalp          v4.8h, v1.16b
-    shrn            v6.8b, v4.8h, #2
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            2b
-3:  /* last columns */
-    ld1             {v0.16b}, [INPTR0], #16
-    ld1             {v1.16b}, [INPTR1], #16
-    mov             v4.16b, v16.16b
-    subs            V_SAMP, V_SAMP, #1
-    /* expand right */
-    tbl             v2.16b, {v0.16b}, v18.16b
-    tbl             v3.16b, {v1.16b}, v18.16b
-    uadalp          v4.8h, v2.16b
-    uadalp          v4.8h, v3.16b
-    shrn            v6.8b, v4.8h, #2
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            1b
-
-    br              x30
-
-    .unreq          IMAGE_WIDTH
-    .unreq          MAX_V_SAMP
-    .unreq          V_SAMP
-    .unreq          BLOCK_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA
-    .unreq          OUTPTR
-    .unreq          INPTR0
-    .unreq          INPTR1
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- *                              JCOEFPTR block, int last_dc_val,
- *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ *                             JCOEFPTR block, int last_dc_val,
+ *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
  *
  */
 
@@ -3010,41 +1878,6 @@ asm_function jsimd_h2v2_downsample_neon
 
 .macro generate_jsimd_huff_encode_one_block fast_tbl
 
-.balign 16
-.if \fast_tbl == 1
-Ljsimd_huff_encode_one_block_neon_consts:
-.else
-Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
-.endif
-    .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
-          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
-.if \fast_tbl == 1
-    .byte    0,   1,   2,   3,  16,  17,  32,  33, \
-            18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
-    .byte   34,  35,  48,  49, 255, 255,  50,  51, \
-            36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
-    .byte    8,   9,  22,  23,  36,  37,  50,  51, \
-           255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
-    .byte   54,  55,  40,  41,  26,  27,  12,  13, \
-            14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
-    .byte    6,   7,  20,  21,  34,  35,  48,  49, \
-            50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
-    .byte   42,  43,  28,  29,  14,  15,  30,  31, \
-            44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
-    .byte  255, 255, 255, 255,  56,  57,  42,  43, \
-            28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
-    .byte   26,  27,  40,  41,  42,  43,  28,  29, \
-            14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
-    .byte  255, 255, 255, 255,   0,   1, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-             0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
-    .byte  255, 255, 255, 255, 255, 255, 255, 255, \
-           255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
-    .byte    4,   5,   6,   7, 255, 255, 255, 255, \
-           255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
-.endif
-
 .if \fast_tbl == 1
 asm_function jsimd_huff_encode_one_block_neon
 .else
@@ -3052,13 +1885,9 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
 .endif
     sub             sp, sp, 272
     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
-    /* Save ARM registers */
+    /* Save Arm registers */
     stp             x19, x20, [sp]
-.if \fast_tbl == 1
-    adr             x15, Ljsimd_huff_encode_one_block_neon_consts
-.else
-    adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
-.endif
+    get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
     ldr             PUT_BUFFER, [x0, #0x10]
     ldr             PUT_BITSw, [x0, #0x18]
     ldrsh           w12, [x2]               /* load DC coeff in w12 */
@@ -3278,7 +2107,7 @@ asm_function jsimd_huff_encode_one_block_neon_slowtbl
       put_bits        x10, x11
     addp            v16.16b, v16.16b, v18.16b
       checkbuf47
-    umov            x9,v16.D[0]
+    umov            x9, v16.D[0]
       put_bits        x13, x12
     cnt             v17.8b, v16.8b
       mvn             x9, x9
diff --git a/media/libjpeg/simd/arm/align.h b/media/libjpeg/simd/arm/align.h
new file mode 100644
index 0000000000..cff4241e84
--- /dev/null
+++ b/media/libjpeg/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to obtain memory alignment for structures and variables */
+#if defined(_MSC_VER)
+#define ALIGN(alignment)  __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment)  __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
diff --git a/media/libjpeg/simd/arm/jccolor-neon.c b/media/libjpeg/simd/arm/jccolor-neon.c
new file mode 100644
index 0000000000..9fcc62dd25
--- /dev/null
+++ b/media/libjpeg/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+#define F_0_168  11059
+#define F_0_331  21709
+#define F_0_500  32768
+#define F_0_418  27439
+#define F_0_081  5329
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+  F_0_298, F_0_587, F_0_113, F_0_168,
+  F_0_331, F_0_500, F_0_418, F_0_081
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgray-neon.c b/media/libjpeg/simd/arm/jcgray-neon.c
new file mode 100644
index 0000000000..71c7b2de21
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298  19595
+#define F_0_587  38470
+#define F_0_113  7471
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/media/libjpeg/simd/arm/jcgryext-neon.c b/media/libjpeg/simd/arm/jcgryext-neon.c
new file mode 100644
index 0000000000..416a7385df
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                 JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                 int num_rows)
+{
+  JSAMPROW inptr;
+  JSAMPROW outptr;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 16) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 16) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+      uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+      uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+      uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+
+      /* Narrow Y values to 8-bit and store to memory.  Buffer overwrite is
+       * permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr += 16;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/jchuff.h b/media/libjpeg/simd/arm/jchuff.h
new file mode 100644
index 0000000000..2fbd252b9b
--- /dev/null
+++ b/media/libjpeg/simd/arm/jchuff.h
@@ -0,0 +1,131 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE  64
+#else
+#define BIT_BUF_SIZE  32
+#endif
+
+typedef struct {
+  size_t put_buffer;                    /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
+}
+
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+    buffer += 8; \
+  } \
+}
+
+#else
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#define SPLAT() { \
+  buffer[0] = (JOCTET)(put_buffer >> 24); \
+  buffer[1] = (JOCTET)(put_buffer >> 16); \
+  buffer[2] = (JOCTET)(put_buffer >>  8); \
+  buffer[3] = (JOCTET)(put_buffer      ); \
+  buffer += 4; \
+}
+#else
+#define SPLAT() { \
+  put_buffer = __builtin_bswap32(put_buffer); \
+  __asm__("str %1, [%0], #4" : "+r" (buffer) : "r" (put_buffer)); \
+}
+#endif
+
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    SPLAT(); \
+  } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+  diff |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(diff, nbits) \
+}
diff --git a/media/libjpeg/simd/arm/jcphuff-neon.c b/media/libjpeg/simd/arm/jcphuff-neon.c
new file mode 100644
index 0000000000..b91c5db478
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcphuff-neon.c
@@ -0,0 +1,622 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits)
+{
+  JCOEF *values_ptr = values;
+  JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients. */
+    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+    /* Compute diff values. */
+    int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs);
+    vst1q_s16(diff_values_ptr, diff);
+    values_ptr += 8;
+    diff_values_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in the values and diff_values blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(values_ptr, vdupq_n_s16(0));
+    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+    values_ptr += 8;
+    diff_values_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap.  A set bit means that the corresponding
+   * coefficient != 0.
+   */
+  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  *zerobits = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  zerobits[0] = ~bitmap0;
+  zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine().
+ *
+ * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
+ * found in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  JCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Rows of coefficients to zero (since they haven't been processed) */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Same operation but for remaining partial vector */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap. */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Compute absolute value of coefficients and apply point transform Al. */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap. */
+  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    return 63 - BUILTIN_CLZLL(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - BUILTIN_CLZ(bitmap1);
+  } else {
+    return 31 - BUILTIN_CLZ(bitmap0);
+  }
+#endif
+}
diff --git a/media/libjpeg/simd/arm/jcsample-neon.c b/media/libjpeg/simd/arm/jcsample-neon.c
new file mode 100644
index 0000000000..8a3e237838
--- /dev/null
+++ b/media/libjpeg/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
+  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
+  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
+  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
+  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
+  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
+  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
+  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
+  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the common case of 2:1 horizontal and 1:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+      /* Divide total by 2 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+                         vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+    /* Divide total by 2, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
+
+
+/* Downsample pixel values of a single component.
+ * This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+ * without smoothing.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr0, inptr1, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr0 = input_data[outrow];
+    inptr1 = input_data[outrow + 1];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+       */
+      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+      /* Divide total by 4 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory and increment pointers. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load pixels in last DCT block into a table. */
+    uint8x16_t pixels_r0 =
+      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+    uint8x16_t pixels_r1 =
+      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+    uint8x8x2_t table_r0 =
+      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+    uint8x8x2_t table_r1 =
+      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+    /* Divide total by 4, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
diff --git a/media/libjpeg/simd/arm/jdcolext-neon.c b/media/libjpeg/simd/arm/jdcolext-neon.c
new file mode 100644
index 0000000000..c3c07a1964
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolext-neon.c
@@ -0,0 +1,374 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+                                JDIMENSION input_row, JSAMPARRAY output_buf,
+                                int num_rows)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    int cols_remaining = output_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+      uint8x16_t y  = vld1q_u8(inptr0);
+      uint8x16_t cb = vld1q_u8(inptr1);
+      uint8x16_t cr = vld1q_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cr)));
+      int16x8_t cr_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cr)));
+      int16x8_t cb_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cb)));
+      int16x8_t cb_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cb)));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+                                            consts, 0);
+      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+                                            consts, 0);
+      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+                                  consts, 1);
+      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+                                  consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+                                         vrshrn_n_s32(g_sub_y_lh, 15));
+      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+                                         vrshrn_n_s32(g_sub_y_hh, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+                                               consts, 2);
+      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+                                               consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+                                               consts, 3);
+      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+                                               consts, 3);
+      /* Add Y. */
+      int16x8_t r_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t r_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t b_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t b_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t g_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t g_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+                                       vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x16x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Store RGB pixel data to memory. */
+      vst3q_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 16;
+      inptr1 += 16;
+      inptr2 += 16;
+      outptr += (RGB_PIXELSIZE * 16);
+    }
+
+    if (cols_remaining >= 8) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      vst3_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 8;
+      inptr1 += 8;
+      inptr2 += 8;
+      outptr += (RGB_PIXELSIZE * 8);
+      cols_remaining -= 8;
+    }
+
+    /* Handle the tail elements. */
+    if (cols_remaining > 0) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst4_lane_u8(outptr, rgba, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst3_lane_u8(outptr, rgb, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB565 pixel data to memory. */
+      switch (cols_remaining) {
+      case 7:
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 6:
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 5:
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 4:
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 3:
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 2:
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      case 1:
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+        FALLTHROUGH             /*FALLTHROUGH*/
+      default:
+        break;
+      }
+#endif
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/jdcolor-neon.c b/media/libjpeg/simd/arm/jdcolor-neon.c
new file mode 100644
index 0000000000..ea4668f1d3
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdcolor-neon.c
@@ -0,0 +1,142 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 Conversion */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/media/libjpeg/simd/arm/jdmerge-neon.c b/media/libjpeg/simd/arm/jdmerge-neon.c
new file mode 100644
index 0000000000..e4f91fdc0e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmerge-neon.c
@@ -0,0 +1,145 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
diff --git a/media/libjpeg/simd/arm/jdmrgext-neon.c b/media/libjpeg/simd/arm/jdmrgext-neon.c
new file mode 100644
index 0000000000..5b89bdb339
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,723 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba;
+    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr, rgba);
+#else
+    uint8x16x3_t rgb;
+    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr, rgb);
+#endif
+
+    /* Increment pointers. */
+    inptr0 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* De-interleave Y component values into two separate vectors, one
+     * containing the component values with even-numbered indices and one
+     * containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba_h;
+    rgba_h.val[RGB_RED] = r.val[1];
+    rgba_h.val[RGB_GREEN] = g.val[1];
+    rgba_h.val[RGB_BLUE] = b.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    uint8x8x4_t rgba_l;
+    rgba_l.val[RGB_RED] = r.val[0];
+    rgba_l.val[RGB_GREEN] = g.val[0];
+    rgba_l.val[RGB_BLUE] = b.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr, rgba_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr, rgba_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb_h;
+    rgb_h.val[RGB_RED] = r.val[1];
+    rgb_h.val[RGB_GREEN] = g.val[1];
+    rgb_h.val[RGB_BLUE] = b.val[1];
+    uint8x8x3_t rgb_l;
+    rgb_l.val[RGB_RED] = r.val[0];
+    rgb_l.val[RGB_GREEN] = g.val[0];
+    rgb_l.val[RGB_BLUE] = b.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr, rgb_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr, rgb_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ *
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr0, outptr1;
+  /* Pointers to Y (both rows), Cb, and Cr data */
+  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  int cols_remaining = output_width;
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba0, rgba1;
+    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr0, rgba0);
+    vst4q_u8(outptr1, rgba1);
+#else
+    uint8x16x3_t rgb0, rgb1;
+    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr0, rgb0);
+    vst3q_u8(outptr1, rgb1);
+#endif
+
+    /* Increment pointers. */
+    inptr0_0 += 16;
+    inptr0_1 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr0 += (RGB_PIXELSIZE * 16);
+    outptr1 += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* For each row, de-interleave Y component values into two separate
+     * vectors, one containing the component values with even-numbered indices
+     * and one containing the component values with odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128) */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128) */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba0_h, rgba1_h;
+    rgba0_h.val[RGB_RED] = r0.val[1];
+    rgba1_h.val[RGB_RED] = r1.val[1];
+    rgba0_h.val[RGB_GREEN] = g0.val[1];
+    rgba1_h.val[RGB_GREEN] = g1.val[1];
+    rgba0_h.val[RGB_BLUE] = b0.val[1];
+    rgba1_h.val[RGB_BLUE] = b1.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+    uint8x8x4_t rgba0_l, rgba1_l;
+    rgba0_l.val[RGB_RED] = r0.val[0];
+    rgba1_l.val[RGB_RED] = r1.val[0];
+    rgba0_l.val[RGB_GREEN] = g0.val[0];
+    rgba1_l.val[RGB_GREEN] = g1.val[0];
+    rgba0_l.val[RGB_BLUE] = b0.val[0];
+    rgba1_l.val[RGB_BLUE] = b1.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst4_u8(outptr0, rgba0_l);
+      vst4_u8(outptr1, rgba1_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst4_lane_u8(outptr0, rgba0_l, 0);
+      vst4_lane_u8(outptr1, rgba1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb0_h, rgb1_h;
+    rgb0_h.val[RGB_RED] = r0.val[1];
+    rgb1_h.val[RGB_RED] = r1.val[1];
+    rgb0_h.val[RGB_GREEN] = g0.val[1];
+    rgb1_h.val[RGB_GREEN] = g1.val[1];
+    rgb0_h.val[RGB_BLUE] = b0.val[1];
+    rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+    uint8x8x3_t rgb0_l, rgb1_l;
+    rgb0_l.val[RGB_RED] = r0.val[0];
+    rgb1_l.val[RGB_RED] = r1.val[0];
+    rgb0_l.val[RGB_GREEN] = g0.val[0];
+    rgb1_l.val[RGB_GREEN] = g1.val[0];
+    rgb0_l.val[RGB_BLUE] = b0.val[0];
+    rgb1_l.val[RGB_BLUE] = b1.val[0];
+    /* Store RGB pixel data to memory. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 14:
+      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 13:
+      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 12:
+      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 11:
+      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 10:
+      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 9:
+      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 8:
+      vst3_u8(outptr0, rgb0_l);
+      vst3_u8(outptr1, rgb1_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 6:
+      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 5:
+      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 4:
+      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 3:
+      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 2:
+      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    case 1:
+      vst3_lane_u8(outptr0, rgb0_l, 0);
+      vst3_lane_u8(outptr1, rgb1_l, 0);
+      FALLTHROUGH               /*FALLTHROUGH*/
+    default:
+      break;
+    }
+#endif
+  }
+}
diff --git a/media/libjpeg/simd/arm/jdsample-neon.c b/media/libjpeg/simd/arm/jdsample-neon.c
new file mode 100644
index 0000000000..90ec6782c4
--- /dev/null
+++ b/media/libjpeg/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            |         |         |         |
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *            |         |         |         |
+ *            +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ *     p0(upsampled) = s0
+ *     p5(upsampled) = s2
+ */
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* First pixel component value in this row of the original image */
+    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
+     * For p1: containing sample = s0, nearest neighboring sample = s1
+     * For p2: containing sample = s1, nearest neighboring sample = s0
+     */
+    uint8x16_t s0 = vld1q_u8(inptr);
+    uint8x16_t s1 = vld1q_u8(inptr + 1);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s1_add_3s0_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+    uint16x8_t s1_add_3s0_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+    uint16x8_t s0_add_3s1_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+    uint16x8_t s0_add_3s1_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+    /* Add ordered dithering bias to odd pixel values. */
+    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+    /* The offset is initially 1, because the first pixel component has already
+     * been stored.  However, in subsequent iterations of the SIMD loop, this
+     * offset is (2 * colctr - 1) to stay within the bounds of the sample
+     * buffers without having to resort to a slow scalar tail case for the last
+     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
+     * in jmemmgr.c for more details.
+     */
+    unsigned outptr_offset = 1;
+    uint8x16x2_t output_pixels;
+
+    /* We use software pipelining to maximise performance.  The code indented
+     * an extra two spaces begins the next iteration of the loop.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+        s0 = vld1q_u8(inptr + colctr - 1);
+        s1 = vld1q_u8(inptr + colctr);
+
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                         vrshrn_n_u16(s1_add_3s0_h, 2));
+      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                         vshrn_n_u16(s0_add_3s1_h, 2));
+
+        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+         * denote low half and high half respectively.
+         */
+        s1_add_3s0_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+        s1_add_3s0_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+        s0_add_3s1_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+        s0_add_3s1_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+        /* Add ordered dithering bias to odd pixel values. */
+        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr + outptr_offset, output_pixels);
+      outptr_offset = 2 * colctr - 1;
+    }
+
+    /* Complete the last iteration of the loop. */
+
+    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                       vrshrn_n_u16(s1_add_3s0_h, 2));
+    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                       vshrn_n_u16(s0_add_3s1_h, 2));
+    /* Store pixel component values to memory. */
+    vst2q_u8(outptr + outptr_offset, output_pixels);
+
+    /* Last pixel component value in this row of the original image */
+    outptr[2 * downsampled_width - 1] =
+      GETJSAMPLE(inptr[downsampled_width - 1]);
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *       sA   |         |         |         |
+ *            | p6   p7 | p8   p9 | p10  p11|
+ *            +---------+---------+---------+
+ *            | p12  p13| p14  p15| p16  p17|
+ *       sB   |         |         |         |
+ *            | p18  p19| p20  p21| p22  p23|
+ *            +---------+---------+---------+
+ *            | p24  p25| p26  p27| p28  p29|
+ *       sC   |         |         |         |
+ *            | p30  p31| p32  p33| p34  p35|
+ *            +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above.  To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1.  For example:
+ *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ *                      1/4 * (3/4 * s0B + 1/4 * s0A)
+ *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ *     p0(upsampled) = s0A
+ *     p35(upsampled) = s2C
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* First pixel component value in this row of the original image */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* Step 1: Blend samples vertically in columns s0 and s1.
+     * Leave the divide by 4 until the end, when it can be done for both
+     * dimensions at once, right-shifting by 4.
+     */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0A = vld1q_u8(inptr0);
+    uint8x16_t s0B = vld1q_u8(inptr1);
+    uint8x16_t s0C = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+                                      vget_high_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+                                      vget_high_u8(s0B), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+                                      vget_high_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+                                      vget_high_u8(s1B), three_u8);
+
+    /* Step 2: Blend the already-blended columns. */
+
+    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+    /* Add ordered dithering bias to odd pixel values. */
+    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+    uint8x16x2_t output_pixels0 = { {
+      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+    } };
+    uint8x16x2_t output_pixels1 = { {
+      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+    } };
+
+    /* Store pixel component values to memory.
+     * The minimum size of the output buffer for each row is 64 bytes => no
+     * need to worry about buffer overflow here.  See "Creation of 2-D sample
+     * arrays" in jmemmgr.c for more details.
+     */
+    vst2q_u8(outptr0 + 1, output_pixels0);
+    vst2q_u8(outptr1 + 1, output_pixels1);
+
+    /* The first pixel of the image shifted our loads and stores by one byte.
+     * We have to re-align on a 32-byte boundary at some point before the end
+     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+     * bounds of the sample buffers without having to resort to a slow scalar
+     * tail case for the last (downsampled_width % 16) samples.  See "Creation
+     * of 2-D sample arrays" in jmemmgr.c for more details.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+      /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+      /* Load and compute s0colsum0 and s0colsum1. */
+      s0A = vld1q_u8(inptr0 + colctr - 1);
+      s0B = vld1q_u8(inptr1 + colctr - 1);
+      s0C = vld1q_u8(inptr2 + colctr - 1);
+      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+                             three_u8);
+      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+                             three_u8);
+      /* Load and compute s1colsum0 and s1colsum1. */
+      s1A = vld1q_u8(inptr0 + colctr);
+      s1B = vld1q_u8(inptr1 + colctr);
+      s1C = vld1q_u8(inptr2 + colctr);
+      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+                             three_u8);
+      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+                             three_u8);
+
+      /* Step 2: Blend the already-blended columns. */
+
+      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+      /* Add ordered dithering bias to odd pixel values. */
+      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                          vshrn_n_u16(output0_p1_h, 4));
+      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                          vrshrn_n_u16(output0_p2_h, 4));
+      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                          vshrn_n_u16(output1_p1_h, 4));
+      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                          vrshrn_n_u16(output1_p2_h, 4));
+      /* Store pixel component values to memory. */
+      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+    }
+
+    /* Last pixel component value in this row of the original image */
+    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr0[downsampled_width - 1]);
+    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr2[downsampled_width - 1]);
+    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+    inrow++;
+  }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ *            +---------+
+ *            |   p0    |
+ *     sA     |         |
+ *            |   p1    |
+ *            +---------+
+ *            |   p2    |
+ *     sB     |         |
+ *            |   p3    |
+ *            +---------+
+ *            |   p4    |
+ *     sC     |         |
+ *            |   p5    |
+ *            +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * sA + 1/4 * sB
+ *     p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ *     p0(upsampled) = sA
+ *     p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+  /* Set up constants. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];
+    inptr1 = input_data[inrow];
+    inptr2 = input_data[inrow + 1];
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;
+
+    /* The size of the input and output buffers is always a multiple of 32
+     * bytes => no need to worry about buffer overflow when reading/writing
+     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
+     * details.
+     */
+    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+                                      vget_high_u8(sB), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+                                      vget_high_u8(sB), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store pixel component values to memory. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            |         |         |
+ *            | p0   p1 | p2   p3 |
+ *            |         |         |
+ *            +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above.  To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ *     p0(upsampled) = p1(upsampled) = s0
+ *     p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr;
+  int inrow;
+  unsigned colctr;
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr + 2 * colctr, output_pixels);
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            | p0   p1 | p2   p3 |
+ *       sA   |         |         |
+ *            | p4   p5 | p6   p7 |
+ *            +---------+---------+
+ *            | p8   p9 | p10  p11|
+ *       sB   |         |         |
+ *            | p12  p13| p14  p15|
+ *            +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above.  To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr, outptr0, outptr1;
+  int inrow, outrow;
+  unsigned colctr;
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values for both output rows to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+      vst2q_u8(outptr1 + 2 * colctr, output_pixels);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/arm/jfdctfst-neon.c b/media/libjpeg/simd/arm/jfdctfst-neon.c
new file mode 100644
index 0000000000..bb371be399
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17795 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382  12544
+#define F_0_541  17792
+#define F_0_707  23168
+#define F_0_306  9984
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  /* Load DCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jfdctint-neon.c b/media/libjpeg/simd/arm/jfdctint-neon.c
new file mode 100644
index 0000000000..ccfc07b15d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+  F_0_298, -F_0_390,  F_0_541,  F_0_765,
+ -F_0_899,  F_1_175,  F_1_501, -F_1_847,
+ -F_1_961,  F_2_053, -F_2_562,  F_3_072
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+  /* Load DCT constants. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t s_rows_0123 = vld4q_s16(data);
+  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  int32x4_t z1_l =
+    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  int32x4_t z1_h =
+    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t col2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t col2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+  int32x4_t col6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t col6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+  /* Odd part */
+  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t row2_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t row2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+  int32x4_t row6_scaled_l =
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t row6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+  /* Odd part */
+  z1 = vaddq_s16(tmp4, tmp7);
+  z2 = vaddq_s16(tmp5, tmp6);
+  z3 = vaddq_s16(tmp4, tmp6);
+  z4 = vaddq_s16(tmp5, tmp7);
+  /* sqrt(2) * c3 */
+  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* sqrt(2) * (-c1+c3+c5-c7) */
+  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* sqrt(2) * ( c1+c3-c5+c7) */
+  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* sqrt(2) * ( c1+c3+c5-c7) */
+  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* sqrt(2) * ( c1+c3-c5-c7) */
+  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* sqrt(2) * (-c1-c3) */
+  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* sqrt(2) * (-c3-c5) */
+  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* sqrt(2) * (c5-c3) */
+  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/media/libjpeg/simd/arm/jidctfst-neon.c b/media/libjpeg/simd/arm/jidctfst-neon.c
new file mode 100644
index 0000000000..a91be5362e
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2
+
+#define F_0_082  2688
+#define F_0_414  13568
+#define F_0_847  27776
+#define F_0_613  20096
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row4);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into vectors.
+     */
+    int16x8_t dcval = row0;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+    row4 = dcval;
+    row5 = dcval;
+    row6 = dcval;
+    row7 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_low_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_high_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_high_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_low_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+  } else {
+    /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+    /* Load quantization table. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2 */
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(col0, col4);
+  int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+  int16x8_t tmp13 = vaddq_s16(col2, col6);
+  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+  tmp12 = vsubq_s16(tmp12, tmp13);
+
+  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+  /* Odd part */
+  int16x8_t z13 = vaddq_s16(col5, col3);
+  int16x8_t neg_z10 = vsubq_s16(col3, col5);
+  int16x8_t z11 = vaddq_s16(col1, col7);
+  int16x8_t z12 = vsubq_s16(col1, col7);
+
+  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
+  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+  tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+  z5 = vaddq_s16(z5, z10_add_z12);
+  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+  tmp10 = vaddq_s16(tmp10, z12);
+  tmp10 = vsubq_s16(tmp10, z5);
+  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+  tmp12 = vaddq_s16(tmp12, z5);
+
+  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
+  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+  col0 = vaddq_s16(tmp0, tmp7);
+  col7 = vsubq_s16(tmp0, tmp7);
+  col1 = vaddq_s16(tmp1, tmp6);
+  col6 = vsubq_s16(tmp1, tmp6);
+  col2 = vaddq_s16(tmp2, tmp5);
+  col5 = vsubq_s16(tmp2, tmp5);
+  col4 = vaddq_s16(tmp3, tmp4);
+  col3 = vsubq_s16(tmp3, tmp4);
+
+  /* Scale down by a factor of 8, narrowing to 8-bit. */
+  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
+  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
+  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
+  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
+  /* Clamp to range [0-255]. */
+  uint8x16_t cols_01 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_45 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_23 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_67 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+  /* Transpose block to prepare for store. */
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store DCT block to memory. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/media/libjpeg/simd/arm/jidctint-neon.c b/media/libjpeg/simd/arm/jidctint-neon.c
new file mode 100644
index 0000000000..043b652e6c
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctint-neon.c
@@ -0,0 +1,802 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time.  Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
+
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  F_0_899,             F_0_541,
+  F_2_562,             F_0_298_MINUS_0_899,
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
+  F_0_541_PLUS_0_765,  F_1_175,
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
+  0, 0, 0, 0
+};
+
+
+/* Forward declaration of regular and sparse IDCT helper functions */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients.  For
+ * reference, the C implementation (jpeg_idct_slow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block.  This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients.  Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.)  Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ *      +-------+-------+                   +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   0   |   1   |                   |   0   |   2   |
+ *      |       |       |    transpose      |       |       |
+ *      +-------+-------+     ------>       +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   2   |   3   |                   |   1   |   3   |
+ *      |       |       |                   |       |       |
+ *      +-------+-------+                   +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right.  If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified.  On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
+ *    "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
+ *    result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
+ *    result is all zero.  For the left 4x8 half, this is handled identically
+ *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
+ *    the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block.  (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  int16_t workspace_l[8 * DCTSIZE / 2];
+  int16_t workspace_r[8 * DCTSIZE / 2];
+
+  /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+  /* Load DCT coefficients in left 4x8 block. */
+  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table for left 4x8 block. */
+  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+  int16x4_t bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  if (bitmap_rows_4567 == 0) {
+    bitmap = vorr_s16(bitmap, row3);
+    bitmap = vorr_s16(bitmap, row2);
+    bitmap = vorr_s16(bitmap, row1);
+    int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (left_ac_bitmap == 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l, quadrant);
+      vst4_s16(workspace_r, quadrant);
+    } else {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l, workspace_r);
+    }
+  } else {
+    jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                   row6, row7, quant_row0, quant_row1,
+                                   quant_row2, quant_row3, quant_row4,
+                                   quant_row5, quant_row6, quant_row7,
+                                   workspace_l, workspace_r);
+  }
+
+  /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+  /* Load DCT coefficients in right 4x8 block. */
+  row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+  row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+  row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+  row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+  row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+  row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+  row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+  row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+  /* Load quantization table for right 4x8 block. */
+  quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+  quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+  quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+  quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+  quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+  quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+  quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+  quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+  /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+  bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+  bitmap = vorr_s16(bitmap, row3);
+  bitmap = vorr_s16(bitmap, row2);
+  bitmap = vorr_s16(bitmap, row1);
+  int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  /* If this remains non-zero, a "regular" second pass will be performed. */
+  int64_t right_ac_dc_bitmap = 1;
+
+  if (right_ac_bitmap == 0) {
+    bitmap = vorr_s16(bitmap, row0);
+    right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (right_ac_dc_bitmap != 0) {
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to workspace, transposing in the process. */
+      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+    }
+  } else {
+    if (bitmap_rows_4567 == 0) {
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l + 4 * DCTSIZE / 2,
+                                    workspace_r + 4 * DCTSIZE / 2);
+    } else {
+      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                     row6, row7, quant_row0, quant_row1,
+                                     quant_row2, quant_row3, quant_row4,
+                                     quant_row5, quant_row6, quant_row7,
+                                     workspace_l + 4 * DCTSIZE / 2,
+                                     workspace_r + 4 * DCTSIZE / 2);
+    }
+  }
+
+  /* Second pass: compute IDCT on rows in workspace. */
+
+  /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+  if (right_ac_dc_bitmap == 0) {
+    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+  } else {
+    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+  }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
+ * function-- or some other optimized variant-- needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
+ * found in jidctint.c.  Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  z3_s16 = vmul_s16(row4, quant_row4);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+  int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0.  This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
+   * (VST4 transposes the blocks.  We need to operate on rows in the next
+   * pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients.  (To process the full 8x8 DCT block, this function-- or some
+ * other optimized variant-- needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values are all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
+ * can be found in jidctint.c.  Algorithmic changes made here are documented
+ * inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+  int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Performs the second pass of the accurate inverse DCT on a 4x8 block
+ * of coefficients.
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0.  This simplifies the IDCT calculation,
+ * accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3 is all 0) */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1 are both all 0) */
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/media/libjpeg/simd/arm/jidctred-neon.c b/media/libjpeg/simd/arm/jidctred-neon.c
new file mode 100644
index 0000000000..be9627e61d
--- /dev/null
+++ b/media/libjpeg/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+
+#define F_0_211  1730
+#define F_0_509  4176
+#define F_0_601  4926
+#define F_0_720  5906
+#define F_0_765  6270
+#define F_0_850  6967
+#define F_0_899  7373
+#define F_1_061  8697
+#define F_1_272  10426
+#define F_1_451  11893
+#define F_1_847  15137
+#define F_2_172  17799
+#define F_2_562  20995
+#define F_3_624  29692
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+  -F_0_720, F_0_850, -F_1_272, F_3_624
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+  /* Dequantize DCT coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+  row1 = vmulq_s16(row1, quant_row1);
+  row3 = vmulq_s16(row3, quant_row3);
+  row5 = vmulq_s16(row5, quant_row5);
+  row7 = vmulq_s16(row7, quant_row7);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+  /* Pass 1: process columns from input, put results in vectors row0 and
+   * row1.
+   */
+
+  /* Even part */
+  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+  /* Odd part */
+  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+  /* Transpose two rows, ready for second pass. */
+  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+  int16x8_t cols_0246 = cols_0246_1357.val[0];
+  int16x8_t cols_1357 = cols_0246_1357.val[1];
+  /* Duplicate columns such that each is accessible in its own vector. */
+  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+                                         vreinterpretq_s32_s16(cols_1357));
+  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+  /* Pass 2: process two rows, store to output array. */
+
+  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+   * care."
+   */
+  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+  /* Odd part: we're only interested in the bottom half of tmp0. */
+  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+                                      vsubhn_s32(tmp10, tmp0));
+  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+  /* Narrow to 8-bit and convert to unsigned. */
+  uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+  /* Store 2x2 block to memory. */
+  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.211164243 =  1730 * 2^-13
+ *    0.509795579 =  4176 * 2^-13
+ *    0.601344887 =  4926 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.061594337 =  8697 * 2^-13
+ *    1.451774981 = 11893 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    2.172734803 = 17799 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+  F_1_847, -F_0_765, -F_0_211,  F_1_451,
+ -F_2_172,  F_1_061, -F_0_509, -F_0_601,
+  F_0_899,  F_2_562,        0,        0
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0  = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1  = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2  = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3  = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5  = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6  = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7  = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+     */
+    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+    row0 = dcval;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+  } else {
+    /* All AC coefficients are non-zero; full IDCT calculation required. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x8_t z2 = vmulq_s16(row2, quant_row2);
+    int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+    /* Odd part */
+    int16x8_t z1 = vmulq_s16(row7, quant_row7);
+    z2 = vmulq_s16(row5, quant_row5);
+    z3 = vmulq_s16(row3, quant_row3);
+    int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+  }
+
+  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+                                    vreinterpretq_s32_s16(row_23.val[0]));
+  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+                                    vreinterpretq_s32_s16(row_23.val[1]));
+
+  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+  /* Commence second pass of IDCT. */
+
+  /* Even part */
+  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+  /* Odd part */
+  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+                                          vsubhn_s32(tmp12, tmp0));
+  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+                                          vsubhn_s32(tmp10, tmp2));
+  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+   * An interleaving store completes the transpose.
+   */
+  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+                                    vqmovun_s16(output_cols_13));
+  uint16x4x2_t output_01_23 = { {
+    vreinterpret_u16_u8(output_0123.val[0]),
+    vreinterpret_u16_u8(output_0123.val[1])
+  } };
+
+  /* Store 4x4 block to memory. */
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/media/libjpeg/simd/arm/jquanti-neon.c b/media/libjpeg/simd/arm/jquanti-neon.c
new file mode 100644
index 0000000000..d5d95d89f6
--- /dev/null
+++ b/media/libjpeg/simd/arm/jquanti-neon.c
@@ -0,0 +1,193 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, the resulting sample values are in the range [0, 255],
+ * but the Discrete Cosine Transform (DCT) operates on values centered around
+ * 0.
+ *
+ * To prepare sample values for the DCT, load samples into a DCT workspace,
+ * subtracting CENTERJSAMPLE (128).  The samples, now in the range [-128, 127],
+ * are also widened from 8- to 16-bit.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ *
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+                         DCTELEM *workspace)
+{
+  JCOEFPTR out_ptr = coef_block;
+  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;
+  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;
+  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;
+  int i;
+
+#if defined(__clang__) && (defined(__aarch64__) || defined(_M_ARM64))
+#pragma unroll
+#endif
+  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {
+    /* Load DCT coefficients. */
+    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+    /* Load reciprocals of quantization values. */
+    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+    /* Extract sign from coefficients. */
+    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+    /* Get absolute value of DCT coefficients. */
+    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+    /* Add correction. */
+    abs_row0 = vaddq_u16(abs_row0, corr0);
+    abs_row1 = vaddq_u16(abs_row1, corr1);
+    abs_row2 = vaddq_u16(abs_row2, corr2);
+    abs_row3 = vaddq_u16(abs_row3, corr3);
+
+    /* Multiply DCT coefficients by quantization reciprocals. */
+    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+                                                       vget_low_u16(recip0)));
+    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+                                                       vget_high_u16(recip0)));
+    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+                                                       vget_low_u16(recip1)));
+    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+                                                       vget_high_u16(recip1)));
+    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+                                                       vget_low_u16(recip2)));
+    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+                                                       vget_high_u16(recip2)));
+    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+                                                       vget_low_u16(recip3)));
+    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+                                                       vget_high_u16(recip3)));
+    /* Narrow back to 16-bit. */
+    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+    /* Since VSHR only supports an immediate as its second argument, negate the
+     * shift value and shift left.
+     */
+    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+                                           vnegq_s16(shift0)));
+    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+                                           vnegq_s16(shift1)));
+    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+                                           vnegq_s16(shift2)));
+    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+                                           vnegq_s16(shift3)));
+
+    /* Restore sign to original product. */
+    row0 = veorq_s16(row0, sign_row0);
+    row0 = vsubq_s16(row0, sign_row0);
+    row1 = veorq_s16(row1, sign_row1);
+    row1 = vsubq_s16(row1, sign_row1);
+    row2 = veorq_s16(row2, sign_row2);
+    row2 = vsubq_s16(row2, sign_row2);
+    row3 = veorq_s16(row3, sign_row3);
+    row3 = vsubq_s16(row3, sign_row3);
+
+    /* Store quantized coefficients to memory. */
+    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+  }
+}
diff --git a/media/libjpeg/simd/arm/neon-compat.h.in b/media/libjpeg/simd/arm/neon-compat.h.in
new file mode 100644
index 0000000000..d403f2289f
--- /dev/null
+++ b/media/libjpeg/simd/arm/neon-compat.h.in
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x)  __builtin_clz(x)
+#define BUILTIN_CLZLL(x)  __builtin_clzll(x)
+#define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
diff --git a/media/libjpeg/simd/i386/jccolext-avx2.asm b/media/libjpeg/simd/i386/jccolext-avx2.asm
new file mode 100644
index 0000000000..c46d684436
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-avx2.asm
@@ -0,0 +1,578 @@
+;
+; jccolext.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [esi+ecx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vmovdqa     ymm7, ymm1
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vmovdqa     ymm5, ymm0
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF016_MF033)]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [ebx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vmovdqa     ymm7, ymm0
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vmovdqa     ymm1, ymm6
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF008_MF041)]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [edx], ymm1     ; Save Cr
+
+    sub         ecx, byte SIZEOF_YMMWORD
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    add         ebx, byte SIZEOF_YMMWORD           ; outptr1
+    add         edx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jccolext-mmx.asm b/media/libjpeg/simd/i386/jccolext-mmx.asm
new file mode 100644
index 0000000000..6357a42b2c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-mmx.asm
@@ -0,0 +1,476 @@
+;
+; jccolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                           JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                           int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_mmx)
+
+EXTN(jsimd_rgb_ycc_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, dword [esi+ecx]
+    psllq       mmA, DWORD_BIT
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_ycc_cnv
+    movq        mmD, mmA
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_ycc_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH
+    punpckhbw   mmH, mmH
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=RE
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=RO
+    movq        MMWORD [wk(2)], mm4     ; wk(2)=BE
+    movq        MMWORD [wk(3)], mm5     ; wk(3)=BO
+
+    movq        mm6, mm1
+    punpcklwd   mm1, mm3
+    punpckhwd   mm6, mm3
+    movq        mm7, mm1
+    movq        mm4, mm6
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF016_MF033)]  ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movq        MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movq        MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        mm1, mm1
+    pxor        mm6, mm6
+    punpcklwd   mm1, mm5                ; mm1=BOL
+    punpckhwd   mm6, mm5                ; mm6=BOH
+    psrld       mm1, 1                  ; mm1=BOL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BOH*FIX(0.500)
+
+    movq        mm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm5=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm1
+    paddd       mm4, mm6
+    paddd       mm7, mm5
+    paddd       mm4, mm5
+    psrld       mm7, SCALEBITS          ; mm7=CbOL
+    psrld       mm4, SCALEBITS          ; mm4=CbOH
+    packssdw    mm7, mm4                ; mm7=CbO
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=BE
+
+    movq        mm6, mm0
+    punpcklwd   mm0, mm2
+    punpckhwd   mm6, mm2
+    movq        mm5, mm0
+    movq        mm4, mm6
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF016_MF033)]  ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF016_MF033)]  ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movq        MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        mm0, mm0
+    pxor        mm6, mm6
+    punpcklwd   mm0, mm1                ; mm0=BEL
+    punpckhwd   mm6, mm1                ; mm6=BEH
+    psrld       mm0, 1                  ; mm0=BEL*FIX(0.500)
+    psrld       mm6, 1                  ; mm6=BEH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm5, mm0
+    paddd       mm4, mm6
+    paddd       mm5, mm1
+    paddd       mm4, mm1
+    psrld       mm5, SCALEBITS          ; mm5=CbEL
+    psrld       mm4, SCALEBITS          ; mm4=CbEH
+    packssdw    mm5, mm4                ; mm5=CbE
+
+    psllw       mm7, BYTE_BIT
+    por         mm5, mm7                ; mm5=Cb
+    movq        MMWORD [ebx], mm5       ; Save Cb
+
+    movq        mm0, MMWORD [wk(3)]     ; mm0=BO
+    movq        mm6, MMWORD [wk(2)]     ; mm6=BE
+    movq        mm1, MMWORD [wk(1)]     ; mm1=RO
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm3
+    punpckhwd   mm4, mm3
+    movq        mm7, mm0
+    movq        mm5, mm4
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF008_MF041)]  ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, MMWORD [wk(4)]
+    paddd       mm4, MMWORD [wk(5)]
+    paddd       mm0, mm3
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    pxor        mm3, mm3
+    pxor        mm4, mm4
+    punpcklwd   mm3, mm1                ; mm3=ROL
+    punpckhwd   mm4, mm1                ; mm4=ROH
+    psrld       mm3, 1                  ; mm3=ROL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=ROH*FIX(0.500)
+
+    movq        mm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm1=[PD_ONEHALFM1_CJ]
+
+    paddd       mm7, mm3
+    paddd       mm5, mm4
+    paddd       mm7, mm1
+    paddd       mm5, mm1
+    psrld       mm7, SCALEBITS          ; mm7=CrOL
+    psrld       mm5, SCALEBITS          ; mm5=CrOH
+    packssdw    mm7, mm5                ; mm7=CrO
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=RE
+
+    movq        mm4, mm6
+    punpcklwd   mm6, mm2
+    punpckhwd   mm4, mm2
+    movq        mm1, mm6
+    movq        mm5, mm4
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     mm1, [GOTOFF(eax,PW_MF008_MF041)]  ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF008_MF041)]  ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(6)]
+    paddd       mm4, MMWORD [wk(7)]
+    paddd       mm6, mm2
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y
+
+    pxor        mm2, mm2
+    pxor        mm4, mm4
+    punpcklwd   mm2, mm3                ; mm2=REL
+    punpckhwd   mm4, mm3                ; mm4=REH
+    psrld       mm2, 1                  ; mm2=REL*FIX(0.500)
+    psrld       mm4, 1                  ; mm4=REH*FIX(0.500)
+
+    movq        mm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; mm0=[PD_ONEHALFM1_CJ]
+
+    paddd       mm1, mm2
+    paddd       mm5, mm4
+    paddd       mm1, mm0
+    paddd       mm5, mm0
+    psrld       mm1, SCALEBITS          ; mm1=CrEL
+    psrld       mm5, SCALEBITS          ; mm5=CrEH
+    packssdw    mm1, mm5                ; mm1=CrE
+
+    psllw       mm7, BYTE_BIT
+    por         mm1, mm7                ; mm1=Cr
+    movq        MMWORD [edx], mm1       ; Save Cr
+
+    sub         ecx, byte SIZEOF_MMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    add         ebx, byte SIZEOF_MMWORD                ; outptr1
+    add         edx, byte SIZEOF_MMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jccolext-sse2.asm b/media/libjpeg/simd/i386/jccolext-sse2.asm
new file mode 100644
index 0000000000..c6c80852ac
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolext-sse2.asm
@@ -0,0 +1,503 @@
+;
+; jccolext.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         8
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edx
+    push        ebx
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    mov         ebx, JSAMPROW [ebx]     ; outptr1
+    mov         edx, JSAMPROW [edx]     ; outptr2
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1
+    paddd       xmm4, xmm6
+    paddd       xmm7, xmm5
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF016_MF033)]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0
+    paddd       xmm4, xmm6
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm5, xmm7              ; xmm5=Cb
+    movdqa      XMMWORD [ebx], xmm5     ; Save Cb
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    movdqa      xmm7, xmm0
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]  ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]
+    paddd       xmm4, XMMWORD [wk(5)]
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3
+    paddd       xmm5, xmm4
+    paddd       xmm7, xmm1
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF008_MF041)]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]
+    paddd       xmm4, XMMWORD [wk(7)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [edi], xmm6     ; Save Y
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL
+    punpckhwd   xmm4, xmm3              ; xmm4=REH
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [GOTOFF(eax,PD_ONEHALFM1_CJ)]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2
+    paddd       xmm5, xmm4
+    paddd       xmm1, xmm0
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Cr
+    movdqa      XMMWORD [edx], xmm1     ; Save Cr
+
+    sub         ecx, byte SIZEOF_XMMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    add         ebx, byte SIZEOF_XMMWORD                ; outptr1
+    add         edx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    pop         ebx
+    pop         edx
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jccolor-avx2.asm b/media/libjpeg/simd/i386/jccolor-avx2.asm
new file mode 100644
index 0000000000..14944e952f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-mmx.asm b/media/libjpeg/simd/i386/jccolor-mmx.asm
new file mode 100644
index 0000000000..8cb399bdc4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-mmx.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_mmx)
+
+EXTN(jconst_rgb_ycc_convert_mmx):
+
+PW_F0299_F0337  times 2 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 2 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 2 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 2 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 2 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extrgbx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extbgrx_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extxbgr_ycc_convert_mmx
+%include "jccolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_mmx  jsimd_extxrgb_ycc_convert_mmx
+%include "jccolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jccolor-sse2.asm b/media/libjpeg/simd/i386/jccolor-sse2.asm
new file mode 100644
index 0000000000..686d222ff7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-avx2.asm b/media/libjpeg/simd/i386/jcgray-avx2.asm
new file mode 100644
index 0000000000..560ee0c71e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-mmx.asm b/media/libjpeg/simd/i386/jcgray-mmx.asm
new file mode 100644
index 0000000000..79fdf082a8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-mmx.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_mmx)
+
+EXTN(jconst_rgb_gray_convert_mmx):
+
+PW_F0299_F0337 times 2 dw F_0_299, F_0_337
+PW_F0114_F0250 times 2 dw F_0_114, F_0_250
+PD_ONEHALF     times 2 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extrgbx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extbgrx_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extxbgr_gray_convert_mmx
+%include "jcgryext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_mmx  jsimd_extxrgb_gray_convert_mmx
+%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jcgray-sse2.asm b/media/libjpeg/simd/i386/jcgray-sse2.asm
new file mode 100644
index 0000000000..cb4b28e8f4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jcgryext-avx2.asm b/media/libjpeg/simd/i386/jcgryext-avx2.asm
new file mode 100644
index 0000000000..3fa7973d72
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-avx2.asm
@@ -0,0 +1,457 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [esi+ecx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         ecx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [esi+ecx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         ecx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [esi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [esi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [esi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0299_F0337)]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [GOTOFF(eax,PD_ONEHALF)]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_F0114_F0250)]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [GOTOFF(eax,PD_ONEHALF)]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [edi], ymm6     ; Save Y
+
+    sub         ecx, byte SIZEOF_YMMWORD
+    add         esi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcgryext-mmx.asm b/media/libjpeg/simd/i386/jcgryext-mmx.asm
new file mode 100644
index 0000000000..8af42e5a33
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-mmx.asm
@@ -0,0 +1,355 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
+
+EXTN(jsimd_rgb_gray_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    xor         eax, eax
+    mov         al, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    xor         edx, edx
+    mov         dx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        mmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        mmG, dword [esi+ecx]
+    psllq       mmA, DWORD_BIT
+    por         mmA, mmG
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    movq        mmG, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    mov         ecx, SIZEOF_MMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld16:
+    test        cl, 2*SIZEOF_MMWORD
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_gray_cnv
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 01 11 21 02 12)
+    ; mmG=(22 03 13 23 04 14 24 05)
+    ; mmF=(15 25 06 16 26 07 17 27)
+
+    movq        mmD, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
+    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
+
+    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
+    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
+
+    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
+    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
+
+    movq        mmE, mmA
+    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
+    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
+
+    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
+
+    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
+    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
+
+    pxor        mmH, mmH
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
+
+    movq        mmB, mmE
+    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
+    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
+
+    movq        mmF, mmD
+    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
+    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_MMWORD/8
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_MMWORD/8
+    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_MMWORD/4
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_MMWORD/4
+    movq        mmF, mmA
+    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld4:
+    test        cl, SIZEOF_MMWORD/2
+    mov         ecx, SIZEOF_MMWORD
+    jz          short .rgb_gray_cnv
+    movq        mmD, mmA
+    movq        mmC, mmF
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
+    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
+
+.rgb_gray_cnv:
+    ; mmA=(00 10 20 30 01 11 21 31)
+    ; mmF=(02 12 22 32 03 13 23 33)
+    ; mmD=(04 14 24 34 05 15 25 35)
+    ; mmC=(06 16 26 36 07 17 27 37)
+
+    movq        mmB, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
+    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
+
+    movq        mmG, mmD
+    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
+    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
+
+    movq        mmE, mmA
+    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
+    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
+
+    movq        mmH, mmB
+    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
+    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
+
+    pxor        mmF, mmF
+
+    movq        mmC, mmA
+    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
+    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
+
+    movq        mmD, mmB
+    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
+    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
+
+    movq        mmG, mmE
+    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
+    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
+
+    punpcklbw   mmF, mmH
+    punpckhbw   mmH, mmH
+    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
+    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
+    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movq        mm6, mm1
+    punpcklwd   mm1, mm3
+    punpckhwd   mm6, mm3
+    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm7,  mm6               ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movq        mm6, mm0
+    punpcklwd   mm0, mm2
+    punpckhwd   mm6, mm2
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movq        mm0, mm5                ; mm0=BO
+    movq        mm6, mm4                ; mm6=BE
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm3
+    punpckhwd   mm4, mm3
+    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
+
+    paddd       mm0, mm1
+    paddd       mm4, mm7
+    paddd       mm0, mm3
+    paddd       mm4, mm3
+    psrld       mm0, SCALEBITS          ; mm0=YOL
+    psrld       mm4, SCALEBITS          ; mm4=YOH
+    packssdw    mm0, mm4                ; mm0=YO
+
+    movq        mm4, mm6
+    punpcklwd   mm6, mm2
+    punpckhwd   mm4, mm2
+    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
+
+    paddd       mm6, MMWORD [wk(0)]
+    paddd       mm4, MMWORD [wk(1)]
+    paddd       mm6, mm2
+    paddd       mm4, mm2
+    psrld       mm6, SCALEBITS          ; mm6=YEL
+    psrld       mm4, SCALEBITS          ; mm4=YEH
+    packssdw    mm6, mm4                ; mm6=YE
+
+    psllw       mm0, BYTE_BIT
+    por         mm6, mm0                ; mm6=Y
+    movq        MMWORD [edi], mm6       ; Save Y
+
+    sub         ecx, byte SIZEOF_MMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
+    add         edi, byte SIZEOF_MMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_MMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcgryext-sse2.asm b/media/libjpeg/simd/i386/jcgryext-sse2.asm
new file mode 100644
index 0000000000..c9d6ff1e35
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcgryext-sse2.asm
@@ -0,0 +1,382 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+%define img_width(b)   (b) + 8          ; JDIMENSION img_width
+%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
+%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
+%define output_row(b)  (b) + 20         ; JDIMENSION output_row
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [img_width(eax)]
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         esi, JSAMPIMAGE [output_buf(eax)]
+    mov         ecx, JDIMENSION [output_row(eax)]
+    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
+    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         esi, JSAMPARRAY [input_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    pushpic     eax
+    push        edi
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr0
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    alignx      16, 7
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        eax
+    push        edx
+    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_BYTE
+    movzx       eax, byte [esi+ecx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_WORD
+    movzx       edx, word [esi+ecx]
+    shl         eax, WORD_BIT
+    or          eax, edx
+.column_ld4:
+    movd        xmmA, eax
+    pop         edx
+    pop         eax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         ecx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [esi+ecx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         ecx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         ecx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         ecx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+    alignx      16, 7
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    pmaddwd     xmm1, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0299_F0337)]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    pmaddwd     xmm0, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [GOTOFF(eax,PD_ONEHALF)]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    pmaddwd     xmm6, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [GOTOFF(eax,PW_F0114_F0250)]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [GOTOFF(eax,PD_ONEHALF)]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]
+    paddd       xmm4, XMMWORD [wk(1)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [edi], xmm6     ; Save Y
+
+    sub         ecx, byte SIZEOF_XMMWORD
+    add         esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .column_ld1
+
+    pop         ecx                     ; col
+    pop         esi
+    pop         edi
+    poppic      eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         edi, byte SIZEOF_JSAMPROW
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jchuff-sse2.asm b/media/libjpeg/simd/i386/jchuff-sse2.asm
new file mode 100644
index 0000000000..278cf5e83a
--- /dev/null
+++ b/media/libjpeg/simd/i386/jchuff-sse2.asm
@@ -0,0 +1,761 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+    alignz      32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+               dq 0x000f, 0x001f, 0x003f, 0x007f
+               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+
+    alignz      32
+
+%ifdef PIC
+%define NBITS(x)      nbits_base + x
+%else
+%define NBITS(x)      jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%define mm_put_buffer     mm0
+%define mm_all_0xff       mm1
+%define mm_temp           mm2
+%define mm_nbits          mm3
+%define mm_code_bits      mm3
+%define mm_code           mm4
+%define mm_overflow_bits  mm5
+%define mm_save_nbits     mm6
+
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values, or
+;      mmN treated as eight unsigned 8-bit values
+; bN[i]:  perform the same operation on all unsigned 8-bit values,
+;         i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 9
+%define %%temp   %1
+%define %%tempb  %2
+%define %%temph  %3
+    add         nbits, free_bits             ; nbits += free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    movq        mm_temp, mm_code             ; temp = code;
+    movd        mm_nbits, nbits              ; nbits --> MMX register
+    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
+    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
+    add         free_bits, 64                ; free_bits += 64;
+    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+    movd        mm_save_nbits, nbits_base    ; save nbits_base
+%endif
+    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
+    movq        mm_put_buffer, mm_code       ; put_buffer = code;
+    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+    movq        mm_code, mm_code_bits        ; code = code_bits;
+    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
+    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
+    movd        %%temp, mm_code_bits         ; temp = code_bits;
+    bswap       %%temp                       ; temp = htonl(temp);
+    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
+    jnz         %%.SLOW                      ;   goto %%.SLOW
+    mov         dword [buffer], %%temp       ; *(uint32_t)buffer = temp;
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits    ; restore nbits_base
+%endif
+    %4
+    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
+    %5
+    bswap       nbits                        ; nbits = htonl(nbits);
+    mov         dword [buffer + 4], nbits    ; *(uint32_t)(buffer + 4) = nbits;
+    lea         buffer, [buffer + 8]         ; buffer += 8;
+    %6
+    %7
+    %8
+    jmp %9                                   ; return
+%%.SLOW:
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         %%temp, 16                 ; temp >>= 16;
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits  ; restore nbits_base
+%endif
+    bswap       nbits                      ; nbits = htonl(nbits)
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    shr         nbits, 16                  ; nbits >>= 16;
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    %4
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    %5
+    %6
+    %7
+    %8
+    jmp %9                                 ; return;
+%endmacro
+
+%macro PUSH 1
+    push        %1
+%assign stack_offset  stack_offset + 4
+%endmacro
+
+%macro POP 1
+    pop         %1
+%assign stack_offset  stack_offset - 4
+%endmacro
+
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register.  Equivalent to
+;   get_GOT     %1
+;   lea         %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+    call        %%.geteip
+%%.ref:
+    %4
+    add         %1, %2 - %%.ref
+    jmp         short %%.done
+    align       32
+%%.geteip:
+    %3          4               ; must adjust stack pointer because of call
+    mov         %1, POINTER [esp]
+    ret
+    align       32
+%%.done:
+%else
+    %3          0
+    %4
+%endif
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data in lower indices is not used once
+; esp passes over them, so this is signal-safe.)  Aligning to 128 bytes allows
+; us to find the rest of the data again.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame       eax
+%ifdef PIC
+%define nbits_base  ebx
+%endif
+%define emit_temp   ebx
+%define emit_tempb  bl
+%define emit_temph  bh
+%define dctbl       ecx
+%define block       edx
+%define code_temp   esi
+%define index_temp  edi
+%define t           esp
+%define index       ebp
+
+%assign save_frame  DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%assign stack_offset      0
+%define arg_state         4 + stack_offset
+%define arg_buffer        8 + stack_offset
+%define arg_block        12 + stack_offset
+%define arg_last_dc_val  16 + stack_offset
+%define arg_dctbl        20 + stack_offset
+%define arg_actbl        24 + stack_offset
+
+                                                          ;X: X = code stream
+    mov         block, [esp + arg_block]
+    PUSH        ebx
+    PUSH        ebp
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    PUSH        esi
+    PUSH        edi
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    mov         frame, esp
+    lea         t, [frame - (save_frame + 4)]
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
+    mov         [t + save_frame], frame
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
+
+; %1 - stack pointer adjustment
+%macro GET_SYM_BEFORE 1
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+                                                          ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
+    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    or          index, index_temp                         ;Z:     index |= index_temp;
+%undef index_temp
+%define free_bits  edi
+%endmacro
+
+%macro GET_SYM_AFTER 0
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    not         index                                     ;Z:     index = ~index;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    mov         dctbl, [frame + arg_dctbl]
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
+%endmacro
+
+    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
+                                                          ;Z:     i.e. if code_temp is positive
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define nbits  edx
+%define nbitsb  dl
+%define nbitsh  dh
+    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state  esi
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    mov         state, [frame + arg_state]
+    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+                                                          ;Z:     code = dctbl->ehufco[nbits];
+%define size  ecx
+%define sizeb  cl
+%define sizeh  ch
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
+    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+                                                          ;Z:     size = dctbl->ehufsi[nbits];
+%undef dctbl
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    mov         free_bits, [state + working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+%undef state
+%define actbl  esi
+    mov         actbl, [frame + arg_actbl]
+%define buffer  eax
+    mov         buffer, [frame + arg_buffer]
+%undef frame
+    jmp        .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+; size <= 32, so this is not really a loop
+.BRLOOP1:                                                 ; .BRLOOP1:
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ; nbits = actbl->ehufsi[0xf0];
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ; code = actbl->ehufco[0xf0];
+    and         index, 0x7ffffff                          ; clear index if size == 32
+    sub         size, 16                                  ; size -= 16;
+    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
+    movd        mm_nbits, nbits                           ; nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
+    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+%ifdef PIC
+    times 6     nop
+%else
+    times 2     nop
+%endif
+.BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
+; if size == 32, index remains unchanged.  Correct in .BRLOOP.
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
+.ERLOOP1:                                                 ; .ERLOOP1:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+%ifdef PIC
+    add         size, size                                ;   size += size;
+%else
+    lea         size, [size * 2]                          ;   size += size;
+%endif
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+.BEGIN:                                                   ; .BEGIN:
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    add         nbits, size                               ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP1                                   ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1:                                                  ; .ELOOP1:
+    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
+    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
+    shl         size, 16                                  ; size <<= 16;
+    or          index, size                               ; index |= size;
+    not         index                                     ; index = ~index;
+    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+                                                          ; nbits = t + 1 + 64;
+    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
+    sub         nbits, t                                  ; nbits -= t;
+    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
+    tzcnt       size, index                               ; size = # of trailing 0 bits in index
+    inc         size                                      ; ++size;
+    test        index, index                              ; if (index == 0)
+    jz          .ELOOP2                                   ;   goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+    shr         index, sizeb                              ; index >>= size;
+    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
+    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
+    cmp         size, 16                                  ; if (size <= 16)
+    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
+.BRLOOP2:                                                 ; do {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    sub         size, 16                                  ;   size -= 16;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
+    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+    cmp         size, 16                                  ;     if (size <= 16)
+    jle        .ERLOOP2                                   ;       goto .ERLOOP2;
+    jmp        .BRLOOP2                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align      16
+.BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
+.ERLOOP2:                                                 ; .ERLOOP2:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+    add         size, size                                ;   size += size;
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    lea         nbits, [nbits + size]                     ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP2                                   ; } while (index != 0);
+.ELOOP2:                                                  ; .ELOOP2:
+    mov         nbits, t                                  ; nbits = t;
+    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
+    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
+    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
+    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
+    je          .EFN                                      ; {
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+                                                          ;   code = actbl->ehufco[0];
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    movd        mm_nbits, nbits                           ;     nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+.EFN:                                                     ; } }
+%define frame  esp
+    mov         frame, [t + save_frame]
+%define state  ecx
+    mov         state, [frame + arg_state]
+    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    emms
+    mov         [state + working_state.cur.free_bits], free_bits
+                                                          ; state->cur.free_bits = free_bits;
+    POP         edi
+    POP         esi
+    POP         ebp
+    POP         ebx
+    ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP1:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
+      .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP1:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP1 }, \
+      .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP2:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
+      { cmp     size, 16 }, \
+      { jle     .ERLOOP2 }, \
+      .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP2:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP2 }, \
+      .ELOOP2
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcphuff-sse2.asm b/media/libjpeg/simd/i386/jcphuff-sse2.asm
new file mode 100644
index 0000000000..c26b48a47d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcphuff-sse2.asm
@@ -0,0 +1,662 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding (SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding.  See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+    pxor        N0, N0
+    pxor        N1, N1
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    mov         T1, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    mov         T1, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    mov         T1, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    mov         T1, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    mov         T1, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    mov         T1, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    mov         T1, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    mov         T1, INT [LUT + 15*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+    pinsrw      X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+    pxor        N0, N0
+    pxor        N1, N1
+    pxor        X1, X1
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    mov         T1, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD15
+    mov         T1, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+    pxor        N0, N0
+
+    mov         T0, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+
+    mov         T0, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+    pxor        N0, N0
+    pxor        X0, X0
+
+    mov         T1, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 0
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD7
+    mov         T1, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
+    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
+    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
+    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
+    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
+    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
+    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
+
+    pcmpeqw     xmm0, ZERO
+    pcmpeqw     xmm1, ZERO
+    pcmpeqw     xmm2, ZERO
+    pcmpeqw     xmm3, ZERO
+    pcmpeqw     xmm4, ZERO
+    pcmpeqw     xmm5, ZERO
+    pcmpeqw     xmm6, ZERO
+    pcmpeqw     xmm7, XMMWORD [VALUES + (56*2)]
+
+    packsswb    xmm0, xmm1
+    packsswb    xmm2, xmm3
+    packsswb    xmm4, xmm5
+    packsswb    xmm6, xmm7
+
+    pmovmskb    eax, xmm0
+    pmovmskb    ecx, xmm2
+    pmovmskb    edx, xmm4
+    pmovmskb    esi, xmm6
+
+    shl         ecx, 16
+    shl         esi, 16
+
+    or          eax, ecx
+    or          edx, esi
+
+    not         eax
+    not         edx
+
+    mov         edi, ZEROBITS
+
+    mov         INT [edi], eax
+    mov         INT [edi+SIZEOF_INT], edx
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+;                                        const int *jpeg_natural_order_start,
+;                                        int Sl, int Al, JCOEF *values,
+;                                        size_t *zerobits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *zerobits
+
+%define ZERO    xmm7
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LENEND  eax
+%define LUT     ebx
+%define T0      ecx
+%define T1      edx
+%define BLOCK   esi
+%define VALUES  edi
+%define LEN     ebp
+
+%define ZEROBITS  INT [esp + 5 * 4]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    sub         esp, 4
+    push        ebx
+    push        ecx
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+    push        ebp
+
+    mov         BLOCK, INT [eax + 8]
+    mov         LUT, INT [eax + 12]
+    mov         VALUES, INT [eax + 24]
+    movd        AL, INT [eax + 20]
+    mov         T0, INT [eax + 28]
+    mov         ZEROBITS, T0
+    mov         LEN, INT [eax + 16]
+    pxor        ZERO, ZERO
+    mov         K, LEN
+    and         K, -16
+    shr         K, 4
+    jz          .ELOOP16
+.BLOOP16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    dec         K
+    jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
+.ELOOP16:
+    mov         LENEND, LEN
+    and         LENEND, 7
+
+    test        LEN, 8
+    jz          .TRY7
+    test        LEN, 7
+    jz          .TRY8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    jmp         .PADDING
+.TRY8:
+    LOAD8
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+    jmp         .PADDING
+.TRY7:
+    LOAD7
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+.PADDING:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDING
+    align       16
+.ZEROLOOP:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOP
+.EPADDING:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    pop         ebp
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+    pop         ecx
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+;                                         const int *jpeg_natural_order_start,
+;                                         int Sl, int Al, JCOEF *absvalues,
+;                                         size_t *bits)
+;
+; eax + 8 = const JCOEF *block
+; eax + 12 = const int *jpeg_natural_order_start
+; eax + 16 = int Sl
+; eax + 20 = int Al
+; eax + 24 = JCOEF *values
+; eax + 28 = size_t *bits
+
+%define ZERO    xmm7
+%define ONE     xmm5
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LENEND  eax
+%define LUT     ebx
+%define T0      ecx
+%define T0w      cx
+%define T1      edx
+%define BLOCK   esi
+%define VALUES  edi
+%define KK      ebp
+
+%define ZEROBITS  INT [esp + 5 * 4]
+%define EOB       INT [esp + 5 * 4 + 4]
+%define LEN       INT [esp + 5 * 4 + 8]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    sub         esp, 16
+    push        ebx
+    push        ecx
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+    push        ebp
+
+    pcmpeqw     ONE, ONE
+    psrlw       ONE, 15
+    mov         BLOCK, INT [eax + 8]
+    mov         LUT, INT [eax + 12]
+    mov         VALUES, INT [eax + 24]
+    movd        AL, INT [eax + 20]
+    mov         T0, INT [eax + 28]
+    mov         K,  INT [eax + 16]
+    mov         INT [T0 + 2 * SIZEOF_INT], -1
+    mov         INT [T0 + 3 * SIZEOF_INT], -1
+    mov         ZEROBITS, T0
+    mov         LEN, K
+    pxor        ZERO, ZERO
+    and         K, -16
+    mov         EOB, 0
+    xor         KK, KK
+    shr         K, 4
+    jz          .ELOOPR16
+.BLOOPR16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER16            ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER16:
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    add         KK, 2
+    dec         K
+    jnz         .BLOOPR16
+    test        LEN, 15
+    je          .PADDINGR
+.ELOOPR16:
+    mov         LENEND, LEN
+
+    test        LENEND, 8
+    jz          .TRYR7
+    test        LENEND, 7
+    jz          .TRYR8
+
+    and         LENEND, 7
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER15            ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER15:
+    add         VALUES, 16*2
+    jmp         .PADDINGR
+.TRYR8:
+    LOAD8
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER8             ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER8:
+    add         VALUES, 8*2
+    jmp         .PADDINGR
+.TRYR7:
+    and         LENEND, 7
+    LOAD7
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0, N0                  ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    mov         T1, ZEROBITS
+    not         T0
+    mov         word [T1 + 2 * SIZEOF_INT + KK], T0w
+    pmovmskb    T1, X0                  ; idx = _mm_movemask_epi8(x1);
+    bsr         T1, T1                  ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER7             ; if (idx) {
+    lea         T1, [T1+KK*8]
+    mov         EOB, T1                 ; EOB = k + idx;
+.CONTINUER7:
+    add         VALUES, 8*2
+.PADDINGR:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDINGR
+    align       16
+.ZEROLOOPR:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOPR
+.EPADDINGR:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    mov         eax, EOB
+
+    pop         ebp
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+    pop         ecx
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T1
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcsample-avx2.asm b/media/libjpeg/simd/i386/jcsample-avx2.asm
new file mode 100644
index 0000000000..0a20802dd8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-avx2.asm
@@ -0,0 +1,388 @@
+;
+; jcsample.asm - downsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r24:
+    ; ecx can possibly be 8, 16, 24
+    cmp         ecx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT
+    vpand       ymm0, ymm0, ymm6
+    vpsrlw      ymm3, ymm1, BYTE_BIT
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r24:
+    cmp         ecx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]
+    vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         ecx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         ecx, SIZEOF_YMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6
+    vpsrlw      ymm0, ymm0, BYTE_BIT
+    vpand       ymm5, ymm1, ymm6
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4
+    vpaddw      ymm1, ymm1, ymm5
+
+    vpand       ymm4, ymm2, ymm6
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         ecx, byte SIZEOF_YMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r24
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcsample-mmx.asm b/media/libjpeg/simd/i386/jcsample-mmx.asm
new file mode 100644
index 0000000000..2c223eebe8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-mmx.asm
@@ -0,0 +1,324 @@
+;
+; jcsample.asm - downsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)
+
+EXTN(jsimd_h2v1_downsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6
+    punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]
+    movq        mm2, mm0
+    movq        mm3, mm1
+
+    pand        mm0, mm6
+    psrlw       mm2, BYTE_BIT
+    pand        mm1, mm6
+    psrlw       mm3, BYTE_BIT
+
+    paddw       mm0, mm2
+    paddw       mm1, mm3
+    paddw       mm0, mm7
+    paddw       mm1, mm7
+    psrlw       mm0, 1
+    psrlw       mm1, 1
+
+    packuswb    mm0, mm1
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         short .columnloop
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
+;                           JDIMENSION v_samp_factor,
+;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                           JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)
+
+EXTN(jsimd_h2v2_downsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        mm7, edx
+    pcmpeqw     mm6, mm6
+    punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]
+    movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pand        mm0, mm6
+    psrlw       mm4, BYTE_BIT
+    pand        mm1, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm0, mm4
+    paddw       mm1, mm5
+
+    movq        mm4, mm2
+    movq        mm5, mm3
+    pand        mm2, mm6
+    psrlw       mm4, BYTE_BIT
+    pand        mm3, mm6
+    psrlw       mm5, BYTE_BIT
+    paddw       mm2, mm4
+    paddw       mm3, mm5
+
+    paddw       mm0, mm1
+    paddw       mm2, mm3
+    paddw       mm0, mm7
+    paddw       mm2, mm7
+    psrlw       mm0, 2
+    psrlw       mm2, 2
+
+    packuswb    mm0, mm2
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+
+    add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_MMWORD  ; outptr
+    sub         ecx, byte SIZEOF_MMWORD    ; outcol
+    jnz         near .columnloop
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jcsample-sse2.asm b/media/libjpeg/simd/i386/jcsample-sse2.asm
new file mode 100644
index 0000000000..4fea60d2e2
--- /dev/null
+++ b/media/libjpeg/simd/i386/jcsample-sse2.asm
@@ -0,0 +1,351 @@
+;
+; jcsample.asm - downsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    pxor        xmm1, xmm1
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6
+    psrlw       xmm2, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    test        ecx, ecx
+    jnz         short .columnloop_r8
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         eax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+%define img_width(b)    (b) + 8         ; JDIMENSION image_width
+%define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
+%define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
+%define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
+%define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
+%define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         ecx, JDIMENSION [width_blks(ebp)]
+    shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
+    jz          near .return
+
+    mov         edx, JDIMENSION [img_width(ebp)]
+
+    ; -- expand_right_edge
+
+    push        ecx
+    shl         ecx, 1                  ; output_cols * 2
+    sub         ecx, edx
+    jle         short .expand_end
+
+    mov         eax, INT [max_v_samp(ebp)]
+    test        eax, eax
+    jle         short .expand_end
+
+    cld
+    mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
+    alignx      16, 7
+.expandloop:
+    push        eax
+    push        ecx
+
+    mov         edi, JSAMPROW [esi]
+    add         edi, edx
+    mov         al, JSAMPLE [edi-1]
+
+    rep stosb
+
+    pop         ecx
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    dec         eax
+    jg          short .expandloop
+
+.expand_end:
+    pop         ecx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         edx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
+    mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
+    alignx      16, 7
+.rowloop:
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edi, JSAMPROW [edi]                    ; outptr
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    alignx      16, 7
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    pxor        xmm2, xmm2
+    pxor        xmm3, xmm3
+    mov         ecx, SIZEOF_XMMWORD
+    jmp         short .downsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm0, xmm4
+    paddw       xmm1, xmm5
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         ecx, byte SIZEOF_XMMWORD    ; outcol
+    add         edx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         edi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        ecx, ecx
+    jnz         near .columnloop_r8
+
+    pop         esi
+    pop         edi
+    pop         ecx
+
+    add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         eax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolext-avx2.asm b/media/libjpeg/simd/i386/jdcolext-avx2.asm
new file mode 100644
index 0000000000..015be0416c
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-avx2.asm
@@ -0,0 +1,515 @@
+;
+; jdcolext.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [ebx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [edx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7
+    vpaddw      ymm3, ymm5, ymm7
+    vpaddw      ymm6, ymm0, ymm7
+    vpaddw      ymm7, ymm1, ymm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2                     ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3                     ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6                     ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7                     ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [GOTOFF(eax,PW_MF0228)]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [GOTOFF(eax,PW_F0402)]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm4, ymm4, 1                        ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                        ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm0, ymm0, 1                        ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                        ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2                     ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3                     ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6                     ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7                     ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4                ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5                ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm4, ymm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm4, ymm4, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [esi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolext-mmx.asm b/media/libjpeg/simd/i386/jdcolext-mmx.asm
new file mode 100644
index 0000000000..5813cfcb66
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-mmx.asm
@@ -0,0 +1,404 @@
+;
+; jdcolext.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_mmx(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                           JDIMENSION input_row, JSAMPARRAY output_buf,
+;                           int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_mmx)
+
+EXTN(jsimd_ycc_rgb_convert_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movq        mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
+    movq        mm1, MMWORD [edx]       ; mm1=Cr(01234567)
+
+    pcmpeqw     mm4, mm4
+    pcmpeqw     mm7, mm7
+    psrlw       mm4, BYTE_BIT
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+    movq        mm0, mm4                ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        mm4, mm5                ; mm4=Cb(0246)=CbE
+    psrlw       mm5, BYTE_BIT           ; mm5=Cb(1357)=CbO
+    pand        mm0, mm1                ; mm0=Cr(0246)=CrE
+    psrlw       mm1, BYTE_BIT           ; mm1=Cr(1357)=CrO
+
+    paddw       mm4, mm7
+    paddw       mm5, mm7
+    paddw       mm0, mm7
+    paddw       mm1, mm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm2, mm4                ; mm2=CbE
+    movq        mm3, mm5                ; mm3=CbO
+    paddw       mm4, mm4                ; mm4=2*CbE
+    paddw       mm5, mm5                ; mm5=2*CbO
+    movq        mm6, mm0                ; mm6=CrE
+    movq        mm7, mm1                ; mm7=CrO
+    paddw       mm0, mm0                ; mm0=2*CrE
+    paddw       mm1, mm1                ; mm1=2*CrO
+
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbE * -FIX(0.22800))
+    pmulhw      mm5, [GOTOFF(eax,PW_MF0228)]  ; mm5=(2*CbO * -FIX(0.22800))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrE * FIX(0.40200))
+    pmulhw      mm1, [GOTOFF(eax,PW_F0402)]   ; mm1=(2*CrO * FIX(0.40200))
+
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    paddw       mm5, [GOTOFF(eax,PW_ONE)]
+    psraw       mm4, 1                  ; mm4=(CbE * -FIX(0.22800))
+    psraw       mm5, 1                  ; mm5=(CbO * -FIX(0.22800))
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    paddw       mm1, [GOTOFF(eax,PW_ONE)]
+    psraw       mm0, 1                  ; mm0=(CrE * FIX(0.40200))
+    psraw       mm1, 1                  ; mm1=(CrO * FIX(0.40200))
+
+    paddw       mm4, mm2
+    paddw       mm5, mm3
+    paddw       mm4, mm2                ; mm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       mm5, mm3                ; mm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       mm0, mm6                ; mm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       mm1, mm7                ; mm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
+    movq        MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
+
+    movq        mm4, mm2
+    movq        mm5, mm3
+    punpcklwd   mm2, mm6
+    punpckhwd   mm4, mm6
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm3, mm7
+    punpckhwd   mm5, mm7
+    pmaddwd     mm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm4, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm4, SCALEBITS
+    paddd       mm3, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm3, SCALEBITS
+    psrad       mm5, SCALEBITS
+
+    packssdw    mm2, mm4                ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    mm3, mm5                ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       mm2, mm6                ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       mm3, mm7                ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movq        mm5, MMWORD [esi]       ; mm5=Y(01234567)
+
+    pcmpeqw     mm4, mm4
+    psrlw       mm4, BYTE_BIT           ; mm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm4, mm5                ; mm4=Y(0246)=YE
+    psrlw       mm5, BYTE_BIT           ; mm5=Y(1357)=YO
+
+    paddw       mm0, mm4                ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm5                ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm4                ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm5                ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4,  MMWORD [wk(0)]    ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
+    paddw       mm5,  MMWORD [wk(1)]    ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         dword [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .nextrow
+    mov         byte [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    cmp         ecx, byte SIZEOF_MMWORD/2
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .nextrow
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolext-sse2.asm b/media/libjpeg/simd/i386/jdcolext-sse2.asm
new file mode 100644
index 0000000000..d5572b3294
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolext-sse2.asm
@@ -0,0 +1,458 @@
+;
+; jdcolext.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+%define out_width(b)   (b) + 8          ; JDIMENSION out_width
+%define input_buf(b)   (b) + 12         ; JSAMPIMAGE input_buf
+%define input_row(b)   (b) + 16         ; JDIMENSION input_row
+%define output_buf(b)  (b) + 20         ; JSAMPARRAY output_buf
+%define num_rows(b)    (b) + 24         ; int num_rows
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [out_width(eax)]  ; num_cols
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [input_row(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+    lea         ebx, [ebx+ecx*SIZEOF_JSAMPROW]
+    lea         edx, [edx+ecx*SIZEOF_JSAMPROW]
+
+    pop         ecx
+
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         eax, INT [num_rows(eax)]
+    test        eax, eax
+    jle         near .return
+    alignx      16, 7
+.rowloop:
+    push        eax
+    push        edi
+    push        edx
+    push        ebx
+    push        esi
+    push        ecx                     ; col
+
+    mov         esi, JSAMPROW [esi]     ; inptr0
+    mov         ebx, JSAMPROW [ebx]     ; inptr1
+    mov         edx, JSAMPROW [edx]     ; inptr2
+    mov         edi, JSAMPROW [edi]     ; outptr
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    pcmpeqw     xmm7, xmm7
+    psrlw       xmm4, BYTE_BIT
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [GOTOFF(eax,PW_MF0228)]  ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [GOTOFF(eax,PW_F0402)]   ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm5, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm1, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2
+    paddw       xmm5, xmm3
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm4, xmm6
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm4, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   xmm3, xmm7
+    punpckhwd   xmm5, xmm7
+    pmaddwd     xmm3, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm4, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm4, SCALEBITS
+    paddd       xmm3, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm3, SCALEBITS
+    psrad       xmm5, SCALEBITS
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [esi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .nextrow
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    alignx      16, 7
+
+.nextrow:
+    pop         ecx
+    pop         esi
+    pop         ebx
+    pop         edx
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW
+    add         ebx, byte SIZEOF_JSAMPROW
+    add         edx, byte SIZEOF_JSAMPROW
+    add         edi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         eax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdcolor-avx2.asm b/media/libjpeg/simd/i386/jdcolor-avx2.asm
new file mode 100644
index 0000000000..e05b60d001
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-mmx.asm b/media/libjpeg/simd/i386/jdcolor-mmx.asm
new file mode 100644
index 0000000000..fb7e7bcce4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-mmx.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_mmx)
+
+EXTN(jconst_ycc_rgb_convert_mmx):
+
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extrgb_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extrgbx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extbgrx_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extxbgr_convert_mmx
+%include "jdcolext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_mmx  jsimd_ycc_extxrgb_convert_mmx
+%include "jdcolext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdcolor-sse2.asm b/media/libjpeg/simd/i386/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b736255317
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-avx2.asm b/media/libjpeg/simd/i386/jdmerge-avx2.asm
new file mode 100644
index 0000000000..711e6792d0
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-mmx.asm b/media/libjpeg/simd/i386/jdmerge-mmx.asm
new file mode 100644
index 0000000000..6e8311d408
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-mmx.asm
@@ -0,0 +1,123 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_mmx)
+
+EXTN(jconst_merged_upsample_mmx):
+
+PW_F0402        times 4 dw  F_0_402
+PW_MF0228       times 4 dw -F_0_228
+PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
+PW_ONE          times 4 dw  1
+PD_ONEHALF      times 2 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_mmx  jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx  jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgext-mmx.asm"
diff --git a/media/libjpeg/simd/i386/jdmerge-sse2.asm b/media/libjpeg/simd/i386/jdmerge-sse2.asm
new file mode 100644
index 0000000000..e32f90aa17
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/i386/jdmrgext-avx2.asm b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..e35f7282bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-avx2.asm
@@ -0,0 +1,575 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    vmovdqu     ymm6, YMMWORD [ebx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [edx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3
+    vpaddw      ymm2, ymm4, ymm3
+    vpaddw      ymm1, ymm7, ymm3
+    vpaddw      ymm3, ymm0, ymm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [GOTOFF(eax,PW_MF0228)]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [GOTOFF(eax,PW_MF0228)]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [GOTOFF(eax,PW_F0402)]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [GOTOFF(eax,PW_F0402)]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm4, ymm4, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm6, ymm6, 1                     ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                     ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [GOTOFF(eax,PW_ONE)]
+    vpaddw      ymm0, ymm0, [GOTOFF(eax,PW_ONE)]
+    vpsraw      ymm7, ymm7, 1                     ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                     ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5                  ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2                  ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1                  ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3                  ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6             ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7             ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm6, ymm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpunpckhwd  ymm7, ymm2, ymm3
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    vpmaddwd    ymm7, ymm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    vpaddd      ymm5, ymm5, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm6, ymm6, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [GOTOFF(eax,PD_ONEHALF)]
+    vpaddd      ymm7, ymm7, [GOTOFF(eax,PD_ONEHALF)]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [esi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         ecx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         ecx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        edi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         edi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_YMMWORD  ; inptr1
+    add         edx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st64:
+    cmp         ecx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymmD
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         ecx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         ecx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymmA
+    add         edi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         ecx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         ecx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         ecx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_YMMWORD/16*4
+    sub         ecx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, POINTER [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdmrgext-mmx.asm b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
new file mode 100644
index 0000000000..eb3e36b475
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-mmx.asm
@@ -0,0 +1,460 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
+    movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)
+
+    pxor        mm1, mm1                ; mm1=(all 0's)
+    pcmpeqw     mm3, mm3
+    psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    movq        mm4, mm6
+    punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
+    punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
+    movq        mm0, mm7
+    punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
+    punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL
+
+    paddw       mm6, mm3
+    paddw       mm4, mm3
+    paddw       mm7, mm3
+    paddw       mm0, mm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movq        mm5, mm6                ; mm5=CbH
+    movq        mm2, mm4                ; mm2=CbL
+    paddw       mm6, mm6                ; mm6=2*CbH
+    paddw       mm4, mm4                ; mm4=2*CbL
+    movq        mm1, mm7                ; mm1=CrH
+    movq        mm3, mm0                ; mm3=CrL
+    paddw       mm7, mm7                ; mm7=2*CrH
+    paddw       mm0, mm0                ; mm0=2*CrL
+
+    pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
+    pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
+    pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
+    pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))
+
+    paddw       mm6, [GOTOFF(eax,PW_ONE)]
+    paddw       mm4, [GOTOFF(eax,PW_ONE)]
+    psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
+    psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
+    paddw       mm7, [GOTOFF(eax,PW_ONE)]
+    paddw       mm0, [GOTOFF(eax,PW_ONE)]
+    psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
+    psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))
+
+    paddw       mm6, mm5
+    paddw       mm4, mm2
+    paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
+    movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
+
+    movq        mm6, mm5
+    movq        mm7, mm2
+    punpcklwd   mm5, mm1
+    punpckhwd   mm6, mm1
+    pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   mm2, mm3
+    punpckhwd   mm7, mm3
+    pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm5, SCALEBITS
+    psrad       mm6, SCALEBITS
+    paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       mm2, SCALEBITS
+    psrad       mm7, SCALEBITS
+
+    packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
+    movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
+    movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)
+
+    pcmpeqw     mm6, mm6
+    psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        mm6, mm7                ; mm6=Y(0246)=YE
+    psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO
+
+    movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
+    movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
+    movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)
+
+    paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+    paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+    packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
+    packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+    paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+    paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+    packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
+    packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+    paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+    paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+    packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
+    packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
+    punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
+
+    movq        mmG, mmA
+    movq        mmH, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
+    punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
+
+    psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
+    psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
+
+    movq        mmC, mmD
+    movq        mmB, mmD
+    punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
+    punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
+
+    psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
+
+    movq        mmF, mmE
+    punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
+    punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
+
+    punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
+    punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
+    punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          near .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_MMWORD
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
+    movq        mmA, mmC
+    sub         ecx, byte 2*SIZEOF_MMWORD
+    add         edi, byte 2*SIZEOF_MMWORD
+    jmp         short .column_st4
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmE
+    sub         ecx, byte SIZEOF_MMWORD
+    add         edi, byte SIZEOF_MMWORD
+.column_st4:
+    movd        eax, mmA
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st2
+    mov         dword [edi+0*SIZEOF_DWORD], eax
+    psrlq       mmA, DWORD_BIT
+    movd        eax, mmA
+    sub         ecx, byte SIZEOF_DWORD
+    add         edi, byte SIZEOF_DWORD
+.column_st2:
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi+0*SIZEOF_WORD], ax
+    shr         eax, WORD_BIT
+    sub         ecx, byte SIZEOF_WORD
+    add         edi, byte SIZEOF_WORD
+.column_st1:
+    cmp         ecx, byte SIZEOF_BYTE
+    jb          short .endcolumn
+    mov         byte [edi+0*SIZEOF_BYTE], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+    pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
+    pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+    ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+    ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+    ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+    ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+    punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
+    punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
+    punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
+    punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
+
+    movq        mmC, mmA
+    punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
+    punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
+    movq        mmG, mmB
+    punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
+    punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
+
+    movq        mmD, mmA
+    punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
+    punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
+    movq        mmH, mmC
+    punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
+    punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
+
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st16
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+    sub         ecx, byte SIZEOF_MMWORD
+    jz          short .endcolumn
+
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
+    add         esi, byte SIZEOF_MMWORD                ; inptr0
+    dec         al                                     ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_MMWORD                ; inptr1
+    add         edx, byte SIZEOF_MMWORD                ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st16:
+    cmp         ecx, byte SIZEOF_MMWORD/2
+    jb          short .column_st8
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
+    movq        mmA, mmC
+    movq        mmD, mmH
+    sub         ecx, byte SIZEOF_MMWORD/2
+    add         edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+    cmp         ecx, byte SIZEOF_MMWORD/4
+    jb          short .column_st4
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
+    movq        mmA, mmD
+    sub         ecx, byte SIZEOF_MMWORD/4
+    add         edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+    cmp         ecx, byte SIZEOF_MMWORD/8
+    jb          short .endcolumn
+    movd        dword [edi+0*SIZEOF_DWORD], mmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
+;                                JDIMENSION in_row_group_ctr,
+;                                JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, JDIMENSION [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdmrgext-sse2.asm b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..c113dc4d27
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdmrgext-sse2.asm
@@ -0,0 +1,517 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2012, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        3
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         ecx, JDIMENSION [output_width(eax)]  ; col
+    test        ecx, ecx
+    jz          near .return
+
+    push        ecx
+
+    mov         edi, JSAMPIMAGE [input_buf(eax)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(eax)]
+    mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         edi, JSAMPROW [edi]                      ; outptr
+
+    pop         ecx                     ; col
+
+    alignx      16, 7
+.columnloop:
+    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
+
+    movdqa      xmm6, XMMWORD [ebx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [edx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3
+    paddw       xmm4, xmm3
+    paddw       xmm7, xmm3
+    paddw       xmm0, xmm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [GOTOFF(eax,PW_MF0228)]  ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [GOTOFF(eax,PW_MF0228)]  ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [GOTOFF(eax,PW_F0402)]   ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [GOTOFF(eax,PW_F0402)]   ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm4, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [GOTOFF(eax,PW_ONE)]
+    paddw       xmm0, [GOTOFF(eax,PW_ONE)]
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5
+    paddw       xmm4, xmm2
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm5, xmm1
+    punpckhwd   xmm6, xmm1
+    pmaddwd     xmm5, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm6, [GOTOFF(eax,PW_MF0344_F0285)]
+    punpcklwd   xmm2, xmm3
+    punpckhwd   xmm7, xmm3
+    pmaddwd     xmm2, [GOTOFF(eax,PW_MF0344_F0285)]
+    pmaddwd     xmm7, [GOTOFF(eax,PW_MF0344_F0285)]
+
+    paddd       xmm5, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm6, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [GOTOFF(eax,PD_ONEHALF)]
+    paddd       xmm7, [GOTOFF(eax,PD_ONEHALF)]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+    alignx      16, 7
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+    alignx      16, 7
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    lea         ecx, [ecx+ecx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         ecx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         ecx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_MMWORD
+    sub         ecx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [edi], xmmA
+    add         edi, byte SIZEOF_DWORD
+    sub         ecx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of eax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         ecx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [edi], ax
+    add         edi, byte SIZEOF_WORD
+    sub         ecx, byte SIZEOF_WORD
+    shr         eax, 16
+.column_st1:
+    ; Store the lower 1 byte of eax to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    mov         byte [edi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         ecx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        edi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         ecx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         esi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         ebx, byte SIZEOF_XMMWORD  ; inptr1
+    add         edx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+    alignx      16, 7
+
+.column_st32:
+    cmp         ecx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         ecx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+    add         edi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         ecx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [edi], xmmA
+    add         edi, byte SIZEOF_XMMWORD/8*4
+    sub         ecx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        ecx, ecx
+    jz          short .endcolumn
+    movd        XMM_DWORD [edi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)      (b) + 8    ; JDIMENSION output_width
+%define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
+%define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, POINTER [output_width(ebp)]
+
+    mov         edi, JSAMPIMAGE [input_buf(ebp)]
+    mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
+    mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+    mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+    mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+    mov         edi, JSAMPARRAY [output_buf(ebp)]
+    lea         esi, [esi+ecx*SIZEOF_JSAMPROW]
+
+    push        edx                     ; inptr2
+    push        ebx                     ; inptr1
+    push        esi                     ; inptr00
+    mov         ebx, esp
+
+    push        edi                     ; output_buf (outptr0)
+    push        ecx                     ; in_row_group_ctr
+    push        ebx                     ; input_buf
+    push        eax                     ; output_width
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    add         esi, byte SIZEOF_JSAMPROW  ; inptr01
+    add         edi, byte SIZEOF_JSAMPROW  ; outptr1
+    mov         POINTER [ebx+0*SIZEOF_POINTER], esi
+    mov         POINTER [ebx-1*SIZEOF_POINTER], edi
+
+    call        near EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    add         esp, byte 7*SIZEOF_DWORD
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdsample-avx2.asm b/media/libjpeg/simd/i386/jdsample-avx2.asm
new file mode 100644
index 0000000000..a800c35e08
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-avx2.asm
@@ -0,0 +1,760 @@
+;
+; jdsample.asm - upsampling (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE   times 16 dw 1
+PW_TWO   times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+    vpand       ymm7, ymm7, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    add         eax, byte SIZEOF_YMMWORD-1
+    and         eax, byte -SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    vpcmpeqb    xmm6, xmm6, xmm6
+    vpslldq     xmm6, xmm6, (SIZEOF_XMMWORD-1)
+    vperm2i128  ymm6, ymm6, ymm6, 1             ; (---- ---- ... ---- ---- ff) MSB is ff
+    vpand       ymm6, ymm6, YMMWORD [esi+0*SIZEOF_YMMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [esi+1*SIZEOF_YMMWORD]
+    vperm2i128  ymm6, ymm0, ymm6, 0x20
+    vpslldq     ymm6, ymm6, 15
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vpunpcklbw  ymm0, ymm3, ymm0                ; ymm0=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vperm2i128  ymm3, ymm0, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm0, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpxor       ymm0, ymm0, ymm0                ; ymm0=(all 0's)
+
+    vpmullw     ymm1, ymm1, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm2, ymm2, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_ONE)]
+    vpaddw      ymm3, ymm3, [GOTOFF(ebx,PW_TWO)]
+    vpaddw      ymm6, ymm6, [GOTOFF(ebx,PW_TWO)]
+
+    vpaddw      ymm2, ymm2, ymm1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [ecx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [esi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm3        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpcmpeqb    xmm7, xmm7, xmm7
+    vpsrldq     xmm7, xmm7, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm7        ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm7        ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1
+    vmovdqa     YMMWORD [wk(1)], ymm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_YMMWORD-1
+    and         eax, byte -SIZEOF_YMMWORD
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpcmpeqb    xmm1, xmm1, xmm1
+    vpslldq     xmm1, xmm1, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm1, ymm1, ymm1, 1             ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    vpand       ymm2, ymm1, YMMWORD [edi+1*SIZEOF_YMMWORD]
+    vpand       ymm1, ymm1, YMMWORD [edx+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [ebx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [ecx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [esi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    vpxor       ymm3, ymm3, ymm3        ; ymm3=(all 0's)
+
+    vpunpckhbw  ymm4, ymm0, ymm3        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm3        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm3        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm3        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm3        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm3        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [edx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [edx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [edi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [edi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm3, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm3, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1
+    vmovdqa     YMMWORD [wk(3)], ymm2
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [edx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [edx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm0, ymm1, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm1, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm1, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm1, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm1, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4
+
+    vpmullw     ymm7, ymm7, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm3, ymm3, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [edi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [edi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+
+    vperm2i128  ymm7, ymm1, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm1, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm1, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm1, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm1, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm1, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3
+
+    vpmullw     ymm6, ymm6, [GOTOFF(ebx,PW_THREE)]
+    vpmullw     ymm4, ymm4, [GOTOFF(ebx,PW_THREE)]
+    vpaddw      ymm1, ymm1, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm0, ymm0, [GOTOFF(ebx,PW_EIGHT)]
+    vpaddw      ymm7, ymm7, [GOTOFF(ebx,PW_SEVEN)]
+    vpaddw      ymm5, ymm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_YMMWORD
+    add         ecx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          near .above_16
+
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_YMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         eax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD    ; inptr
+    add         edi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (SIZEOF_YMMWORD-1)
+    and         edx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    cmp         eax, byte SIZEOF_YMMWORD
+    ja          short .above_16
+
+    vmovdqu     xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         near .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [ebx+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [ebx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [edi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         eax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         esi, byte SIZEOF_YMMWORD  ; inptr
+    add         ebx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         edi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdsample-mmx.asm b/media/libjpeg/simd/i386/jdsample-mmx.asm
new file mode 100644
index 0000000000..12c49f0eab
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-mmx.asm
@@ -0,0 +1,731 @@
+;
+; jdsample.asm - upsampling (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_mmx)
+
+EXTN(jconst_fancy_upsample_mmx):
+
+PW_ONE   times 4 dw 1
+PW_TWO   times 4 dw 2
+PW_THREE times 4 dw 3
+PW_SEVEN times 4 dw 7
+PW_EIGHT times 4 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v1_fancy_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        mm0, mm0                ; mm0=(all 0's)
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT
+    pand        mm7,  MMWORD [esi+0*SIZEOF_MMWORD]
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     mm6, mm6
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+    pand        mm6, MMWORD [esi+0*SIZEOF_MMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movq        mm6, MMWORD [esi+1*SIZEOF_MMWORD]
+    psllq       mm6, (SIZEOF_MMWORD-1)*BYTE_BIT
+
+.upsample:
+    movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]
+    movq        mm2, mm1
+    movq        mm3, mm1                ; mm1=( 0 1 2 3 4 5 6 7)
+    psllq       mm2, BYTE_BIT           ; mm2=( - 0 1 2 3 4 5 6)
+    psrlq       mm3, BYTE_BIT           ; mm3=( 1 2 3 4 5 6 7 -)
+
+    por         mm2, mm7                ; mm2=(-1 0 1 2 3 4 5 6)
+    por         mm3, mm6                ; mm3=( 1 2 3 4 5 6 7 8)
+
+    movq        mm7, mm1
+    psrlq       mm7, (SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
+
+    movq        mm4, mm1
+    punpcklbw   mm1, mm0                ; mm1=( 0 1 2 3)
+    punpckhbw   mm4, mm0                ; mm4=( 4 5 6 7)
+    movq        mm5, mm2
+    punpcklbw   mm2, mm0                ; mm2=(-1 0 1 2)
+    punpckhbw   mm5, mm0                ; mm5=( 3 4 5 6)
+    movq        mm6, mm3
+    punpcklbw   mm3, mm0                ; mm3=( 1 2 3 4)
+    punpckhbw   mm6, mm0                ; mm6=( 5 6 7 8)
+
+    pmullw      mm1, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm2, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       mm3, [GOTOFF(ebx,PW_TWO)]
+    paddw       mm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       mm2, mm1
+    paddw       mm5, mm4
+    psrlw       mm2, 2                  ; mm2=OutLE=( 0  2  4  6)
+    psrlw       mm5, 2                  ; mm5=OutHE=( 8 10 12 14)
+    paddw       mm3, mm1
+    paddw       mm6, mm4
+    psrlw       mm3, 2                  ; mm3=OutLO=( 1  3  5  7)
+    psrlw       mm6, 2                  ; mm6=OutHO=( 9 11 13 15)
+
+    psllw       mm3, BYTE_BIT
+    psllw       mm6, BYTE_BIT
+    por         mm2, mm3                ; mm2=OutL=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm6                ; mm5=OutH=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm5
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_mmx(int max_v_samp_factor,
+;                               JDIMENSION downsampled_width,
+;                               JSAMPARRAY input_data,
+;                               JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_mmx)
+
+EXTN(jsimd_h2v2_fancy_upsample_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_MMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    movq        mm0, MMWORD [ebx+0*SIZEOF_MMWORD]  ; mm0=row[ 0][0]
+    movq        mm1, MMWORD [ecx+0*SIZEOF_MMWORD]  ; mm1=row[-1][0]
+    movq        mm2, MMWORD [esi+0*SIZEOF_MMWORD]  ; mm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][0]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][0]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][0]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][0]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][0]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][0]( 4 5 6 7)
+
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    pcmpeqb     mm7, mm7
+    psrlq       mm7, (SIZEOF_MMWORD-2)*BYTE_BIT
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm6
+
+    pand        mm1, mm7                ; mm1=( 0 - - -)
+    pand        mm2, mm7                ; mm2=( 0 - - -)
+
+    movq        MMWORD [wk(0)], mm1
+    movq        MMWORD [wk(1)], mm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_MMWORD-1
+    and         eax, byte -SIZEOF_MMWORD
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     mm1, mm1
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT
+    movq        mm2, mm1
+
+    pand        mm1, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm1=( - - - 7)
+    pand        mm2, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm2=( - - - 7)
+
+    movq        MMWORD [wk(2)], mm1
+    movq        MMWORD [wk(3)], mm2
+
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movq        mm0, MMWORD [ebx+1*SIZEOF_MMWORD]  ; mm0=row[ 0][1]
+    movq        mm1, MMWORD [ecx+1*SIZEOF_MMWORD]  ; mm1=row[-1][1]
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]  ; mm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        mm3, mm3                ; mm3=(all 0's)
+    movq        mm4, mm0
+    punpcklbw   mm0, mm3                ; mm0=row[ 0][1]( 0 1 2 3)
+    punpckhbw   mm4, mm3                ; mm4=row[ 0][1]( 4 5 6 7)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm3                ; mm1=row[-1][1]( 0 1 2 3)
+    punpckhbw   mm5, mm3                ; mm5=row[-1][1]( 4 5 6 7)
+    movq        mm6, mm2
+    punpcklbw   mm2, mm3                ; mm2=row[+1][1]( 0 1 2 3)
+    punpckhbw   mm6, mm3                ; mm6=row[+1][1]( 4 5 6 7)
+
+    pmullw      mm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+
+    paddw       mm1, mm0                ; mm1=Int0L=( 0 1 2 3)
+    paddw       mm5, mm4                ; mm5=Int0H=( 4 5 6 7)
+    paddw       mm2, mm0                ; mm2=Int1L=( 0 1 2 3)
+    paddw       mm6, mm4                ; mm6=Int1H=( 4 5 6 7)
+
+    movq        MMWORD [edx+2*SIZEOF_MMWORD], mm1  ; temporarily save
+    movq        MMWORD [edx+3*SIZEOF_MMWORD], mm5  ; the intermediate data
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm6
+
+    psllq       mm1, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
+    psllq       mm2, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
+
+    movq        MMWORD [wk(2)], mm1
+    movq        MMWORD [wk(3)], mm2
+
+.upsample:
+    ; -- process the upper row
+
+    movq        mm7, MMWORD [edx+0*SIZEOF_MMWORD]  ; mm7=Int0L=( 0 1 2 3)
+    movq        mm3, MMWORD [edx+1*SIZEOF_MMWORD]  ; mm3=Int0H=( 4 5 6 7)
+
+    movq        mm0, mm7
+    movq        mm4, mm3
+    psrlq       mm0, 2*BYTE_BIT                  ; mm0=( 1 2 3 -)
+    psllq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
+    movq        mm5, mm7
+    movq        mm6, mm3
+    psrlq       mm5, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
+    psllq       mm6, 2*BYTE_BIT                  ; mm6=( - 4 5 6)
+
+    por         mm0, mm4                         ; mm0=( 1 2 3 4)
+    por         mm5, mm6                         ; mm5=( 3 4 5 6)
+
+    movq        mm1, mm7
+    movq        mm2, mm3
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm2, 2*BYTE_BIT                  ; mm2=( 5 6 7 -)
+    movq        mm4, mm3
+    psrlq       mm4, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
+
+    por         mm1, MMWORD [wk(0)]              ; mm1=(-1 0 1 2)
+    por         mm2, MMWORD [wk(2)]              ; mm2=( 5 6 7 8)
+
+    movq        MMWORD [wk(0)], mm4
+
+    pmullw      mm7, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm3, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm7
+    paddw       mm5, mm3
+    psrlw       mm1, 4                  ; mm1=Out0LE=( 0  2  4  6)
+    psrlw       mm5, 4                  ; mm5=Out0HE=( 8 10 12 14)
+    paddw       mm0, mm7
+    paddw       mm2, mm3
+    psrlw       mm0, 4                  ; mm0=Out0LO=( 1  3  5  7)
+    psrlw       mm2, 4                  ; mm2=Out0HO=( 9 11 13 15)
+
+    psllw       mm0, BYTE_BIT
+    psllw       mm2, BYTE_BIT
+    por         mm1, mm0                ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
+    por         mm5, mm2                ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edx+0*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edx+1*SIZEOF_MMWORD], mm5
+
+    ; -- process the lower row
+
+    movq        mm6, MMWORD [edi+0*SIZEOF_MMWORD]  ; mm6=Int1L=( 0 1 2 3)
+    movq        mm4, MMWORD [edi+1*SIZEOF_MMWORD]  ; mm4=Int1H=( 4 5 6 7)
+
+    movq        mm7, mm6
+    movq        mm3, mm4
+    psrlq       mm7, 2*BYTE_BIT                  ; mm7=( 1 2 3 -)
+    psllq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
+    movq        mm0, mm6
+    movq        mm2, mm4
+    psrlq       mm0, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
+    psllq       mm2, 2*BYTE_BIT                  ; mm2=( - 4 5 6)
+
+    por         mm7, mm3                         ; mm7=( 1 2 3 4)
+    por         mm0, mm2                         ; mm0=( 3 4 5 6)
+
+    movq        mm1, mm6
+    movq        mm5, mm4
+    psllq       mm1, 2*BYTE_BIT                  ; mm1=( - 0 1 2)
+    psrlq       mm5, 2*BYTE_BIT                  ; mm5=( 5 6 7 -)
+    movq        mm3, mm4
+    psrlq       mm3, (SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
+
+    por         mm1, MMWORD [wk(1)]              ; mm1=(-1 0 1 2)
+    por         mm5, MMWORD [wk(3)]              ; mm5=( 5 6 7 8)
+
+    movq        MMWORD [wk(1)], mm3
+
+    pmullw      mm6, [GOTOFF(ebx,PW_THREE)]
+    pmullw      mm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       mm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       mm7, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       mm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       mm1, mm6
+    paddw       mm0, mm4
+    psrlw       mm1, 4                  ; mm1=Out1LE=( 0  2  4  6)
+    psrlw       mm0, 4                  ; mm0=Out1HE=( 8 10 12 14)
+    paddw       mm7, mm6
+    paddw       mm5, mm4
+    psrlw       mm7, 4                  ; mm7=Out1LO=( 1  3  5  7)
+    psrlw       mm5, 4                  ; mm5=Out1HO=( 9 11 13 15)
+
+    psllw       mm7, BYTE_BIT
+    psllw       mm5, BYTE_BIT
+    por         mm1, mm7                ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
+    por         mm0, mm5                ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_MMWORD
+    add         ecx, byte 1*SIZEOF_MMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_MMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_MMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_MMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_MMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_mmx)
+
+EXTN(jsimd_h2v1_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0
+    punpckhbw   mm1, mm1
+
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2
+    punpckhbw   mm3, mm3
+
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_mmx(int max_v_samp_factor, JDIMENSION output_width,
+;                         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_mmx)
+
+EXTN(jsimd_h2v2_upsample_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_MMWORD)-1
+    and         edx, byte -(2*SIZEOF_MMWORD)
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]
+
+    movq        mm1, mm0
+    punpcklbw   mm0, mm0
+    punpckhbw   mm1, mm1
+
+    movq        MMWORD [ebx+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [ebx+1*SIZEOF_MMWORD], mm1
+    movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0
+    movq        MMWORD [edi+1*SIZEOF_MMWORD], mm1
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    movq        mm2, MMWORD [esi+1*SIZEOF_MMWORD]
+
+    movq        mm3, mm2
+    punpcklbw   mm2, mm2
+    punpckhbw   mm3, mm3
+
+    movq        MMWORD [ebx+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [ebx+3*SIZEOF_MMWORD], mm3
+    movq        MMWORD [edi+2*SIZEOF_MMWORD], mm2
+    movq        MMWORD [edi+3*SIZEOF_MMWORD], mm3
+
+    sub         eax, byte 2*SIZEOF_MMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_MMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_MMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_MMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop
+
+    emms                                ; empty MMX state
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jdsample-sse2.asm b/media/libjpeg/simd/i386/jdsample-sse2.asm
new file mode 100644
index 0000000000..4e28d2f4b8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jdsample-sse2.asm
@@ -0,0 +1,724 @@
+;
+; jdsample.asm - upsampling (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1
+PW_TWO   times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    mov         eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+
+    test        eax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)
+    pand        xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+    pand        xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    jmp         short .upsample
+    alignx      16, 7
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+    movdqa      xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm2, [GOTOFF(ebx,PW_ONE)]
+    paddw       xmm5, [GOTOFF(ebx,PW_ONE)]
+    paddw       xmm3, [GOTOFF(ebx,PW_TWO)]
+    paddw       xmm6, [GOTOFF(ebx,PW_TWO)]
+
+    paddw       xmm2, xmm1
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         eax, byte SIZEOF_XMMWORD
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         eax
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define downsamp_width(b)   (b) + 12    ; JDIMENSION downsampled_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        4
+%define gotptr        wk(0) - SIZEOF_POINTER  ; void *gotptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     eax                     ; make a room for GOT address
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+    movpic      POINTER [gotptr], ebx   ; save GOT address
+
+    mov         edx, eax                ; edx = original ebp
+    mov         eax, JDIMENSION [downsamp_width(edx)]  ; colctr
+    test        eax, eax
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(edx)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(edx)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(edx)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        eax                     ; colctr
+    push        ecx
+    push        edi
+    push        esi
+
+    mov         ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        eax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    push        edx
+    mov         dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         edx
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1
+    movdqa      XMMWORD [wk(1)], xmm2
+
+    poppic      ebx
+
+    add         eax, byte SIZEOF_XMMWORD-1
+    and         eax, byte -SIZEOF_XMMWORD
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+    alignx      16, 7
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pcmpeqb     xmm1, xmm1
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
+    pand        xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+    alignx      16, 7
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pushpic     ebx
+    movpic      ebx, POINTER [gotptr]   ; load GOT address
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1
+    movdqa      XMMWORD [wk(3)], xmm2
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4
+
+    pmullw      xmm7, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm3, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm5, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm0, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       xmm2, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
+    movdqa      xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3
+
+    pmullw      xmm6, [GOTOFF(ebx,PW_THREE)]
+    pmullw      xmm4, [GOTOFF(ebx,PW_THREE)]
+    paddw       xmm1, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm0, [GOTOFF(ebx,PW_EIGHT)]
+    paddw       xmm7, [GOTOFF(ebx,PW_SEVEN)]
+    paddw       xmm5, [GOTOFF(ebx,PW_SEVEN)]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
+
+    poppic      ebx
+
+    sub         eax, byte SIZEOF_XMMWORD
+    add         ecx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         ebx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         esi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         edx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         eax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         esi
+    pop         edi
+    pop         ecx
+    pop         eax
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)
+    jz          short .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          short .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]     ; inptr
+    mov         edi, JSAMPROW [edi]     ; outptr
+    mov         eax, edx                ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         ecx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+%define max_v_samp(b)       (b) + 8     ; int max_v_samp_factor
+%define output_width(b)     (b) + 12    ; JDIMENSION output_width
+%define input_data(b)       (b) + 16    ; JSAMPARRAY input_data
+%define output_data_ptr(b)  (b) + 20    ; JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         edx, JDIMENSION [output_width(ebp)]
+    add         edx, byte (2*SIZEOF_XMMWORD)-1
+    and         edx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         ecx, INT [max_v_samp(ebp)]  ; rowctr
+    test        ecx, ecx
+    jz          near .return
+
+    mov         esi, JSAMPARRAY [input_data(ebp)]    ; input_data
+    mov         edi, POINTER [output_data_ptr(ebp)]
+    mov         edi, JSAMPARRAY [edi]                ; output_data
+    alignx      16, 7
+.rowloop:
+    push        edi
+    push        esi
+
+    mov         esi, JSAMPROW [esi]                    ; inptr
+    mov         ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         eax, edx                               ; colctr
+    alignx      16, 7
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         eax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         esi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         ebx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         edi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+    alignx      16, 7
+
+.nextrow:
+    pop         esi
+    pop         edi
+
+    add         esi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         edi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         ecx, byte 2                  ; rowctr
+    jg          short .rowloop
+
+.return:
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctflt-3dn.asm b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
new file mode 100644
index 0000000000..322ab16325
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-3dn.asm
@@ -0,0 +1,318 @@
+;
+; jfdctflt.asm - floating-point FDCT (3DNow!)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_3dnow)
+
+EXTN(jconst_fdct_float_3dnow):
+
+PD_0_382 times 2 dd 0.382683432365089771728460
+PD_0_707 times 2 dd 0.707106781186547524400844
+PD_0_541 times 2 dd 0.541196100146196984399723
+PD_1_306 times 2 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
+;
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)
+
+EXTN(jsimd_fdct_float_3dnow):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 10)=data0
+    punpckhdq   mm4, mm1                ; mm4=(01 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(06 16)=data6
+    punpckhdq   mm5, mm3                ; mm5=(07 17)=data7
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(02 12)=data2
+    punpckhdq   mm4, mm3                ; mm4=(03 13)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(04 14)=data4
+    punpckhdq   mm0, mm5                ; mm0=(05 15)=data5
+
+    movq        mm3, mm4
+    movq        mm5, mm1
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7
+    movq        mm0, mm6
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2
+    movq        mm1, mm7
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6
+    movq        mm1, mm7
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
+
+    movq        mm4, mm0                ; transpose coefficients
+    punpckldq   mm0, mm1                ; mm0=(00 01)=data0
+    punpckhdq   mm4, mm1                ; mm4=(10 11)=data1
+    movq        mm5, mm2                ; transpose coefficients
+    punpckldq   mm2, mm3                ; mm2=(60 61)=data6
+    punpckhdq   mm5, mm3                ; mm5=(70 71)=data7
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
+    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
+    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
+    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0
+
+    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7
+
+    movq        mm4, mm1                ; transpose coefficients
+    punpckldq   mm1, mm3                ; mm1=(20 21)=data2
+    punpckhdq   mm4, mm3                ; mm4=(30 31)=data3
+    movq        mm0, mm2                ; transpose coefficients
+    punpckldq   mm2, mm5                ; mm2=(40 41)=data4
+    punpckhdq   mm0, mm5                ; mm0=(50 51)=data5
+
+    movq        mm3, mm4
+    movq        mm5, mm1
+    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
+    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
+    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
+    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm2, mm7
+    movq        mm0, mm6
+    pfsub       mm7, mm4                ; mm7=tmp13
+    pfsub       mm6, mm1                ; mm6=tmp12
+    pfadd       mm2, mm4                ; mm2=tmp10
+    pfadd       mm0, mm1                ; mm0=tmp11
+
+    pfadd       mm6, mm7
+    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1
+
+    movq        mm4, mm2
+    movq        mm1, mm7
+    pfsub       mm2, mm0                ; mm2=data4
+    pfsub       mm7, mm6                ; mm7=data6
+    pfadd       mm4, mm0                ; mm4=data0
+    pfadd       mm1, mm6                ; mm1=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7
+
+    pfadd       mm3, mm5                ; mm3=tmp10
+    pfadd       mm5, mm0                ; mm5=tmp11
+    pfadd       mm0, mm6                ; mm0=tmp12, mm6=tmp7
+
+    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3
+
+    movq        mm2, mm3                     ; mm2=tmp10
+    pfsub       mm3, mm0
+    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
+    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
+    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
+    pfadd       mm2, mm3                     ; mm2=z2
+    pfadd       mm0, mm3                     ; mm0=z4
+
+    movq        mm7, mm6
+    pfsub       mm6, mm5                ; mm6=z13
+    pfadd       mm7, mm5                ; mm7=z11
+
+    movq        mm4, mm6
+    movq        mm1, mm7
+    pfsub       mm6, mm2                ; mm6=data3
+    pfsub       mm7, mm0                ; mm7=data7
+    pfadd       mm4, mm2                ; mm4=data5
+    pfadd       mm1, mm0                ; mm1=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
+
+    add         edx, byte 2*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .columnloop
+
+    femms                               ; empty MMX/3DNow! state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctflt-sse.asm b/media/libjpeg/simd/i386/jfdctflt-sse.asm
new file mode 100644
index 0000000000..86952c6499
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctflt-sse.asm
@@ -0,0 +1,369 @@
+;
+; jfdctflt.asm - floating-point FDCT (SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+%define data(b)       (b) + 8           ; FAST_FLOAT *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [GOTOFF(ebx,PD_0_707)]  ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [GOTOFF(ebx,PD_0_707)]  ; xmm3=z3
+
+    movaps      xmm1, xmm2                    ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [GOTOFF(ebx,PD_0_382)]  ; xmm2=z5
+    mulps       xmm1, [GOTOFF(ebx,PD_0_541)]  ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [GOTOFF(ebx,PD_1_306)]  ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2                    ; xmm1=z2
+    addps       xmm6, xmm2                    ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         edx, byte 4*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .columnloop
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctfst-mmx.asm b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
new file mode 100644
index 0000000000..80645a50d7
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-mmx.asm
@@ -0,0 +1,395 @@
+;
+; jfdctfst.asm - fast integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_mmx)
+
+EXTN(jconst_fdct_ifast_mmx):
+
+PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
+
+EXTN(jsimd_fdct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1
+    movq        mm4, mm5
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0
+    movq        mm4, mm5
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    psubw       mm5, mm7                ; mm5=tmp13
+    psubw       mm0, mm4                ; mm0=tmp12
+    paddw       mm1, mm7                ; mm1=tmp10
+    paddw       mm6, mm4                ; mm6=tmp11
+
+    paddw       mm0, mm5
+    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
+
+    movq        mm7, mm1
+    movq        mm4, mm5
+    psubw       mm1, mm6                ; mm1=data4
+    psubw       mm5, mm0                ; mm5=data6
+    paddw       mm7, mm6                ; mm7=data0
+    paddw       mm4, mm0                ; mm4=data2
+
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+
+    ; -- Odd part
+
+    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
+
+    paddw       mm2, mm3                ; mm2=tmp10
+    paddw       mm3, mm6                ; mm3=tmp11
+    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
+
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
+
+    movq        mm1, mm2                     ; mm1=tmp10
+    psubw       mm2, mm6
+    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
+    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
+    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
+    paddw       mm1, mm2                     ; mm1=z2
+    paddw       mm6, mm2                     ; mm6=z4
+
+    movq        mm5, mm0
+    psubw       mm0, mm3                ; mm0=z13
+    paddw       mm5, mm3                ; mm5=z11
+
+    movq        mm7, mm0
+    movq        mm4, mm5
+    psubw       mm0, mm1                ; mm0=data3
+    psubw       mm5, mm6                ; mm5=data7
+    paddw       mm7, mm1                ; mm7=data5
+    paddw       mm4, mm6                ; mm4=data1
+
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
+
+    add         edx, byte 4*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctfst-sse2.asm b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..446fa7a68f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctfst-sse2.asm
@@ -0,0 +1,403 @@
+;
+; jfdctfst.asm - fast integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm6, [GOTOFF(ebx,PW_F0707)]  ; xmm6=z1
+
+    movdqa      xmm1, xmm4
+    movdqa      xmm7, xmm3
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z3
+
+    movdqa      xmm4, xmm2                    ; xmm4=tmp10
+    psubw       xmm2, xmm0
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F0382)]  ; xmm2=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1306)]  ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2                    ; xmm4=z2
+    paddw       xmm0, xmm2                    ; xmm0=z4
+
+    movdqa      xmm3, xmm6
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm5, xmm3
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm3, xmm1
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm0, xmm2
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm1, xmm5
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F0707)]  ; xmm5=z1
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm2, xmm3
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F0707)]  ; xmm0=z3
+
+    movdqa      xmm4, xmm7                    ; xmm4=tmp10
+    psubw       xmm7, xmm1
+    pmulhw      xmm7, [GOTOFF(ebx,PW_F0382)]  ; xmm7=z5
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F0541)]  ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1306)]  ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7                    ; xmm4=z2
+    paddw       xmm1, xmm7                    ; xmm1=z4
+
+    movdqa      xmm3, xmm5
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm2, xmm3
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctint-avx2.asm b/media/libjpeg/simd/i386/jfdctint-avx2.asm
new file mode 100644
index 0000000000..23cf733135
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-avx2.asm
@@ -0,0 +1,331 @@
+;
+; jfdctint.asm - accurate integer FDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3)
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9:    Pass (1 or 2)
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [GOTOFF(ebx, PW_1_NEG1)]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
+%else
+    vpaddw      %7, %7, [GOTOFF(ebx, PW_DESCALE_P2X)]
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7
+    vpunpckhwd  %6, %6, %7
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_F130_F054_MF130_F054)]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %6, %6, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %6, %6, DESCALE_P %+ %9
+
+    vpackssdw   %3, %2, %6              ; %6=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2
+    vpunpckhwd  %7, %7, %2
+    vpmaddwd    %6, %6, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [GOTOFF(ebx, PW_MF078_F117_F078_F117)]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4
+    vpunpckhwd  %4, %8, %4
+    vpmaddwd    %2, %2, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [GOTOFF(ebx, PW_MF060_MF089_MF050_MF256)]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %4, %4, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %4, %4, DESCALE_P %+ %9
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2
+    vpunpckhwd  %5, %5, %2
+    vpmaddwd    %8, %8, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [GOTOFF(ebx, PW_F050_MF256_F060_MF089)]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpaddd      %5, %5, [GOTOFF(ebx, PD_DESCALE_P %+ %9)]
+    vpsrad      %8, %8, DESCALE_P %+ %9
+    vpsrad      %5, %5, DESCALE_P %+ %9
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        ebp
+    mov         ebp, esp
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(ebp)]  ; (DCTELEM *)
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], ymm3
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], ymm7
+
+    vzeroupper
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctint-mmx.asm b/media/libjpeg/simd/i386/jfdctint-mmx.asm
new file mode 100644
index 0000000000..34a43b9e5e
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-mmx.asm
@@ -0,0 +1,620 @@
+;
+; jfdctint.asm - accurate integer FDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_mmx)
+
+EXTN(jconst_fdct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 4 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_mmx(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_mmx)
+
+EXTN(jsimd_fdct_islow_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.rowloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
+    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
+    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
+    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
+    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
+    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
+    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
+    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
+    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    psllw       mm5, PASS1_BITS         ; mm5=data0
+    psllw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm4, DESCALE_P1
+    psrad       mm1, DESCALE_P1
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm2, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm1, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       mm3, DESCALE_P1
+    psrad       mm5, DESCALE_P1
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.columnloop:
+
+    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
+    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
+
+    movq        mm4, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
+    movq        mm5, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
+    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
+
+    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+
+    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
+    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
+
+    movq        mm4, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
+    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
+    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
+    movq        mm3, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
+    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
+
+    movq        mm0, mm7
+    movq        mm5, mm6
+    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
+    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
+    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
+    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
+
+    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
+
+    movq        mm7, mm4                ; transpose coefficients(phase 2)
+    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
+    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
+    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
+
+    movq        mm2, mm7
+    movq        mm3, mm4
+    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
+    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
+    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
+    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movq        mm1, mm5
+    movq        mm6, mm0
+    paddw       mm5, mm7                ; mm5=tmp10
+    paddw       mm0, mm4                ; mm0=tmp11
+    psubw       mm1, mm7                ; mm1=tmp13
+    psubw       mm6, mm4                ; mm6=tmp12
+
+    movq        mm7, mm5
+    paddw       mm5, mm0                ; mm5=tmp10+tmp11
+    psubw       mm7, mm0                ; mm7=tmp10-tmp11
+
+    paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       mm5, PASS1_BITS         ; mm5=data0
+    psraw       mm7, PASS1_BITS         ; mm7=data4
+
+    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
+    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movq        mm4, mm1                ; mm1=tmp13
+    movq        mm0, mm1
+    punpcklwd   mm4, mm6                ; mm6=tmp12
+    punpckhwd   mm0, mm6
+    movq        mm1, mm4
+    movq        mm6, mm0
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=data2L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]   ; mm0=data2H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=data6L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]  ; mm6=data6H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm6, DESCALE_P2
+
+    packssdw    mm4, mm0                ; mm4=data2
+    packssdw    mm1, mm6                ; mm1=data6
+
+    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
+
+    ; -- Odd part
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp6
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp7
+
+    movq        mm0, mm2                ; mm2=tmp4
+    movq        mm6, mm3                ; mm3=tmp5
+    paddw       mm0, mm5                ; mm0=z3
+    paddw       mm6, mm7                ; mm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm4, mm0
+    movq        mm1, mm0
+    punpcklwd   mm4, mm6
+    punpckhwd   mm1, mm6
+    movq        mm0, mm4
+    movq        mm6, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]  ; mm4=z3L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]  ; mm1=z3H
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]   ; mm0=z4L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]   ; mm6=z4H
+
+    movq        MMWORD [wk(0)], mm4     ; wk(0)=z3L
+    movq        MMWORD [wk(1)], mm1     ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movq        mm4, mm2
+    movq        mm1, mm2
+    punpcklwd   mm4, mm7
+    punpckhwd   mm1, mm7
+    movq        mm2, mm4
+    movq        mm7, mm1
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm4=tmp4L
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm1=tmp4H
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]   ; mm2=tmp7L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]   ; mm7=tmp7H
+
+    paddd       mm4, MMWORD [wk(0)]     ; mm4=data7L
+    paddd       mm1, MMWORD [wk(1)]     ; mm1=data7H
+    paddd       mm2, mm0                ; mm2=data1L
+    paddd       mm7, mm6                ; mm7=data1H
+
+    paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm4, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm2, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+
+    packssdw    mm4, mm1                ; mm4=data7
+    packssdw    mm2, mm7                ; mm2=data1
+
+    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
+
+    movq        mm1, mm3
+    movq        mm7, mm3
+    punpcklwd   mm1, mm5
+    punpckhwd   mm7, mm5
+    movq        mm3, mm1
+    movq        mm5, mm7
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm1=tmp5L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm7=tmp5H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]   ; mm3=tmp6L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]   ; mm5=tmp6H
+
+    paddd       mm1, mm0                ; mm1=data5L
+    paddd       mm7, mm6                ; mm7=data5H
+    paddd       mm3, MMWORD [wk(0)]     ; mm3=data3L
+    paddd       mm5, MMWORD [wk(1)]     ; mm5=data3H
+
+    paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm1, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       mm3, DESCALE_P2
+    psrad       mm5, DESCALE_P2
+
+    packssdw    mm1, mm7                ; mm1=data5
+    packssdw    mm3, mm5                ; mm3=data3
+
+    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
+
+    add         edx, byte 4*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         near .columnloop
+
+    emms                                ; empty MMX state
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jfdctint-sse2.asm b/media/libjpeg/simd/i386/jfdctint-sse2.asm
new file mode 100644
index 0000000000..6f8e18cb9d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jfdctint-sse2.asm
@@ -0,0 +1,633 @@
+;
+; jfdctint.asm - accurate integer FDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+%define data(b)       (b) + 8           ; DCTELEM *data
+
+%define original_ebp  ebp + 0
+%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+;   push        edi                     ; unused
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process rows.
+
+    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F130_F054)]   ; xmm7=data2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=data2H
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm4=data6L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm0=data6H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3H
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F117_F078)]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P1)]
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+;   mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    paddw       xmm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=data2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F130_F054)]   ; xmm2=data2H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=data6L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm6=data6H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm4=z3L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm1=z3H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F117_F078)]   ; xmm2=z4L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F117_F078)]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [GOTOFF(ebx,PD_DESCALE_P2)]
+    paddd       xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
+
+;   pop         edi                     ; unused
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctflt-3dn.asm b/media/libjpeg/simd/i386/jidctflt-3dn.asm
new file mode 100644
index 0000000000..87951910d8
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-3dn.asm
@@ -0,0 +1,451 @@
+;
+; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_3dnow)
+
+EXTN(jconst_idct_float_3dnow):
+
+PD_1_414        times 2 dd 1.414213562373095048801689
+PD_1_847        times 2 dd 1.847759065022573512256366
+PD_1_082        times 2 dd 1.082392200292393968799446
+PD_2_613        times 2 dd 2.613125929752753055713286
+PD_RNDINT_MAGIC times 2 dd 100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 8 db CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
+;                        JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_3dnow)
+
+EXTN(jsimd_idct_float_3dnow):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/2                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    pushpic     ebx                     ; save GOT address
+    mov         ebx, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    mov         eax, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    or          ebx, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    or          eax, ebx
+    poppic      ebx                     ; restore GOT address
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0
+    psrad       mm0, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm0, mm0
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0
+    punpckhdq   mm1, mm1
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movd        mm0, dword [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movd        mm2, dword [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm0, mm0
+    punpcklwd   mm1, mm1
+    psrad       mm0, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm0, mm0
+    pi2fd       mm1, mm1
+
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movd        mm2, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movd        mm3, dword [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movd        mm5, dword [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movd        mm1, dword [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   mm2, mm2
+    punpcklwd   mm3, mm3
+    psrad       mm2, (DWORD_BIT-WORD_BIT)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm2, mm2
+    pi2fd       mm3, mm3
+
+    pfmul       mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    punpcklwd   mm5, mm5
+    punpcklwd   mm1, mm1
+    psrad       mm5, (DWORD_BIT-WORD_BIT)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)
+    pi2fd       mm5, mm5
+    pi2fd       mm1, mm1
+
+    pfmul       mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    pfmul       mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 01)
+    pfadd       mm7, mm3                ; mm7=data1=(10 11)
+    pfsub       mm5, mm1                ; mm5=data7=(70 71)
+    pfsub       mm0, mm3                ; mm0=data6=(60 61)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, mm6                ; transpose coefficients
+    punpckldq   mm6, mm7                ; mm6=(00 10)
+    punpckhdq   mm1, mm7                ; mm1=(01 11)
+    movq        mm3, mm0                ; transpose coefficients
+    punpckldq   mm0, mm5                ; mm0=(60 70)
+    punpckhdq   mm3, mm5                ; mm3=(61 71)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm5, MMWORD [wk(1)]     ; mm5=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm6, mm7
+    movq        mm1, mm5
+    pfadd       mm7, mm2                ; mm7=data2=(20 21)
+    pfadd       mm5, mm4                ; mm5=data4=(40 41)
+    pfsub       mm6, mm2                ; mm6=data5=(50 51)
+    pfsub       mm1, mm4                ; mm1=data3=(30 31)
+
+    movq        mm0, mm7                ; transpose coefficients
+    punpckldq   mm7, mm1                ; mm7=(20 30)
+    punpckhdq   mm0, mm1                ; mm0=(21 31)
+    movq        mm3, mm5                ; transpose coefficients
+    punpckldq   mm5, mm6                ; mm5=(40 50)
+    punpckhdq   mm3, mm6                ; mm3=(41 51)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
+
+.nextcolumn:
+    add         esi, byte 2*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 2*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/2                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pfsub       mm0, mm2                ; mm0=tmp11
+    pfsub       mm1, mm3
+    pfadd       mm4, mm2                ; mm4=tmp10
+    pfadd       mm5, mm3                ; mm5=tmp13
+
+    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
+    pfsub       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    pfsub       mm4, mm5                ; mm4=tmp3
+    pfsub       mm0, mm1                ; mm0=tmp2
+    pfadd       mm6, mm5                ; mm6=tmp0
+    pfadd       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; tmp3
+    movq        MMWORD [wk(0)], mm0     ; tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    pfadd       mm2, mm1                ; mm2=z11
+    pfadd       mm5, mm3                ; mm5=z13
+    pfsub       mm4, mm1                ; mm4=z12
+    pfsub       mm0, mm3                ; mm0=z10
+
+    movq        mm1, mm2
+    pfsub       mm2, mm5
+    pfadd       mm1, mm5                ; mm1=tmp7
+
+    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11
+
+    movq        mm3, mm0
+    pfadd       mm0, mm4
+    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
+    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
+    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
+    pfsubr      mm3, mm0                     ; mm3=tmp12
+    pfsub       mm4, mm0                     ; mm4=tmp10
+
+    ; -- Final output stage
+
+    pfsub       mm3, mm1                ; mm3=tmp6
+    movq        mm5, mm6
+    movq        mm0, mm7
+    pfadd       mm6, mm1                ; mm6=data0=(00 10)
+    pfadd       mm7, mm3                ; mm7=data1=(01 11)
+    pfsub       mm5, mm1                ; mm5=data7=(07 17)
+    pfsub       mm0, mm3                ; mm0=data6=(06 16)
+    pfsub       mm2, mm3                ; mm2=tmp5
+
+    movq        mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm3, mm3
+    psrld       mm3, WORD_BIT           ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm6, mm1                ; mm6=roundint(data0/8)=(00 ** 10 **)
+    pfadd       mm7, mm1                ; mm7=roundint(data1/8)=(01 ** 11 **)
+    pfadd       mm0, mm1                ; mm0=roundint(data6/8)=(06 ** 16 **)
+    pfadd       mm5, mm1                ; mm5=roundint(data7/8)=(07 ** 17 **)
+
+    pand        mm6, mm3                ; mm6=(00 -- 10 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 01 -- 11)
+    pand        mm0, mm3                ; mm0=(06 -- 16 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 07 -- 17)
+    por         mm6, mm7                ; mm6=(00 01 10 11)
+    por         mm0, mm5                ; mm0=(06 07 16 17)
+
+    movq        mm1, MMWORD [wk(0)]     ; mm1=tmp2
+    movq        mm3, MMWORD [wk(1)]     ; mm3=tmp3
+
+    pfadd       mm4, mm2                ; mm4=tmp4
+    movq        mm7, mm1
+    movq        mm5, mm3
+    pfadd       mm1, mm2                ; mm1=data2=(02 12)
+    pfadd       mm3, mm4                ; mm3=data4=(04 14)
+    pfsub       mm7, mm2                ; mm7=data5=(05 15)
+    pfsub       mm5, mm4                ; mm5=data3=(03 13)
+
+    movq        mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     mm4, mm4
+    psrld       mm4, WORD_BIT           ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
+
+    pfadd       mm3, mm2                ; mm3=roundint(data4/8)=(04 ** 14 **)
+    pfadd       mm7, mm2                ; mm7=roundint(data5/8)=(05 ** 15 **)
+    pfadd       mm1, mm2                ; mm1=roundint(data2/8)=(02 ** 12 **)
+    pfadd       mm5, mm2                ; mm5=roundint(data3/8)=(03 ** 13 **)
+
+    pand        mm3, mm4                ; mm3=(04 -- 14 --)
+    pslld       mm7, WORD_BIT           ; mm7=(-- 05 -- 15)
+    pand        mm1, mm4                ; mm1=(02 -- 12 --)
+    pslld       mm5, WORD_BIT           ; mm5=(-- 03 -- 13)
+    por         mm3, mm7                ; mm3=(04 05 14 15)
+    por         mm1, mm5                ; mm1=(02 03 12 13)
+
+    movq        mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm2=[PB_CENTERJSAMP]
+
+    packsswb    mm6, mm3                ; mm6=(00 01 10 11 04 05 14 15)
+    packsswb    mm1, mm0                ; mm1=(02 03 12 13 06 07 16 17)
+    paddb       mm6, mm2
+    paddb       mm1, mm2
+
+    movq        mm4, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm1                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm4, mm1                ; mm4=(04 05 06 07 14 15 16 17)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 2*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse.asm b/media/libjpeg/simd/i386/jidctflt-sse.asm
new file mode 100644
index 0000000000..b27ecfdf46
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse.asm
@@ -0,0 +1,571 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse)
+
+EXTN(jconst_idct_float_sse):
+
+PD_1_414       times 4 dd  1.414213562373095048801689
+PD_1_847       times 4 dd  1.847759065022573512256366
+PD_1_082       times 4 dd  1.082392200292393968799446
+PD_M2_613      times 4 dd -2.613125929752753055713286
+PD_0_125       times 4 dd  0.125        ; 1/8
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse)
+
+EXTN(jsimd_idct_float_sse):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm1, mm0                   ; mm1=(** 02 ** 03)
+    punpcklwd   mm0, mm0                   ; mm0=(00 00 01 01)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm3, mm1                  ; xmm3=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    movlhps     xmm0, xmm3                 ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm4, mm0                ; mm4=(** 02 ** 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm5, mm1                ; mm5=(** 22 ** 23)
+    punpcklwd   mm1, mm1                ; mm1=(20 20 21 21)
+
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in0H=(02 03)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in0L=(00 01)
+    cvtpi2ps    xmm4, mm4                  ; xmm4=(02 03 ** **)
+    cvtpi2ps    xmm0, mm0                  ; xmm0=(00 01 ** **)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in2H=(22 23)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in2L=(20 21)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(22 23 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(20 21 ** **)
+
+    punpckhwd   mm6, mm2                ; mm6=(** 42 ** 43)
+    punpcklwd   mm2, mm2                ; mm2=(40 40 41 41)
+    punpckhwd   mm7, mm3                ; mm7=(** 62 ** 63)
+    punpcklwd   mm3, mm3                ; mm3=(60 60 61 61)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in4H=(42 43)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in4L=(40 41)
+    cvtpi2ps    xmm6, mm6                  ; xmm6=(42 43 ** **)
+    cvtpi2ps    xmm2, mm2                  ; xmm2=(40 41 ** **)
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in6H=(62 63)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in6L=(60 61)
+    cvtpi2ps    xmm7, mm7                  ; xmm7=(62 63 ** **)
+    cvtpi2ps    xmm3, mm3                  ; xmm3=(60 61 ** **)
+
+    movlhps     xmm0, xmm4              ; xmm0=in0=(00 01 02 03)
+    movlhps     xmm1, xmm5              ; xmm1=in2=(20 21 22 23)
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm2, xmm6              ; xmm2=in4=(40 41 42 43)
+    movlhps     xmm3, xmm7              ; xmm3=in6=(60 61 62 63)
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpckhwd   mm6, mm4                ; mm6=(** 12 ** 13)
+    punpcklwd   mm4, mm4                ; mm4=(10 10 11 11)
+    punpckhwd   mm2, mm0                ; mm2=(** 32 ** 33)
+    punpcklwd   mm0, mm0                ; mm0=(30 30 31 31)
+
+    psrad       mm6, (DWORD_BIT-WORD_BIT)  ; mm6=in1H=(12 13)
+    psrad       mm4, (DWORD_BIT-WORD_BIT)  ; mm4=in1L=(10 11)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(12 13 ** **)
+    cvtpi2ps    xmm2, mm4                  ; xmm2=(10 11 ** **)
+    psrad       mm2, (DWORD_BIT-WORD_BIT)  ; mm2=in3H=(32 33)
+    psrad       mm0, (DWORD_BIT-WORD_BIT)  ; mm0=in3L=(30 31)
+    cvtpi2ps    xmm0, mm2                  ; xmm0=(32 33 ** **)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(30 31 ** **)
+
+    punpckhwd   mm7, mm5                ; mm7=(** 52 ** 53)
+    punpcklwd   mm5, mm5                ; mm5=(50 50 51 51)
+    punpckhwd   mm3, mm1                ; mm3=(** 72 ** 73)
+    punpcklwd   mm1, mm1                ; mm1=(70 70 71 71)
+
+    movlhps     xmm2, xmm4              ; xmm2=in1=(10 11 12 13)
+    movlhps     xmm3, xmm0              ; xmm3=in3=(30 31 32 33)
+
+    psrad       mm7, (DWORD_BIT-WORD_BIT)  ; mm7=in5H=(52 53)
+    psrad       mm5, (DWORD_BIT-WORD_BIT)  ; mm5=in5L=(50 51)
+    cvtpi2ps    xmm4, mm7                  ; xmm4=(52 53 ** **)
+    cvtpi2ps    xmm5, mm5                  ; xmm5=(50 51 ** **)
+    psrad       mm3, (DWORD_BIT-WORD_BIT)  ; mm3=in7H=(72 73)
+    psrad       mm1, (DWORD_BIT-WORD_BIT)  ; mm1=in7L=(70 71)
+    cvtpi2ps    xmm0, mm3                  ; xmm0=(72 73 ** **)
+    cvtpi2ps    xmm1, mm1                  ; xmm1=(70 71 ** **)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movlhps     xmm5, xmm4              ; xmm5=in5=(50 51 52 53)
+    movlhps     xmm1, xmm0              ; xmm1=in7=(70 71 72 73)
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_0_125)]  ; xmm1=[PD_0_125]
+
+    mulps       xmm6, xmm1              ; descale(1/8)
+    mulps       xmm7, xmm1              ; descale(1/8)
+    mulps       xmm5, xmm1              ; descale(1/8)
+    mulps       xmm0, xmm1              ; descale(1/8)
+
+    movhlps     xmm3, xmm6
+    movhlps     xmm1, xmm7
+    cvtps2pi    mm0, xmm6               ; round to int32, mm0=data0L=(00 10)
+    cvtps2pi    mm1, xmm7               ; round to int32, mm1=data1L=(01 11)
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data0H=(20 30)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data1H=(21 31)
+    packssdw    mm0, mm2                ; mm0=data0=(00 10 20 30)
+    packssdw    mm1, mm3                ; mm1=data1=(01 11 21 31)
+
+    movhlps     xmm6, xmm5
+    movhlps     xmm7, xmm0
+    cvtps2pi    mm4, xmm5               ; round to int32, mm4=data7L=(07 17)
+    cvtps2pi    mm5, xmm0               ; round to int32, mm5=data6L=(06 16)
+    cvtps2pi    mm6, xmm6               ; round to int32, mm6=data7H=(27 37)
+    cvtps2pi    mm7, xmm7               ; round to int32, mm7=data6H=(26 36)
+    packssdw    mm4, mm6                ; mm4=data7=(07 17 27 37)
+    packssdw    mm5, mm7                ; mm5=data6=(06 16 26 36)
+
+    packsswb    mm0, mm5                ; mm0=(00 10 20 30 06 16 26 36)
+    packsswb    mm1, mm4                ; mm1=(01 11 21 31 07 17 27 37)
+
+    movaps      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
+    movaps      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movaps      xmm6, [GOTOFF(ebx,PD_0_125)]  ; xmm6=[PD_0_125]
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm5, xmm3
+    movaps      xmm0, xmm1
+    addps       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+    addps       xmm1, xmm4              ; xmm1=data4=(04 14 24 34)
+    subps       xmm5, xmm2              ; xmm5=data5=(05 15 25 35)
+    subps       xmm0, xmm4              ; xmm0=data3=(03 13 23 33)
+
+    mulps       xmm3, xmm6              ; descale(1/8)
+    mulps       xmm1, xmm6              ; descale(1/8)
+    mulps       xmm5, xmm6              ; descale(1/8)
+    mulps       xmm0, xmm6              ; descale(1/8)
+
+    movhlps     xmm7, xmm3
+    movhlps     xmm2, xmm1
+    cvtps2pi    mm2, xmm3               ; round to int32, mm2=data2L=(02 12)
+    cvtps2pi    mm3, xmm1               ; round to int32, mm3=data4L=(04 14)
+    cvtps2pi    mm6, xmm7               ; round to int32, mm6=data2H=(22 32)
+    cvtps2pi    mm7, xmm2               ; round to int32, mm7=data4H=(24 34)
+    packssdw    mm2, mm6                ; mm2=data2=(02 12 22 32)
+    packssdw    mm3, mm7                ; mm3=data4=(04 14 24 34)
+
+    movhlps     xmm4, xmm5
+    movhlps     xmm6, xmm0
+    cvtps2pi    mm5, xmm5               ; round to int32, mm5=data5L=(05 15)
+    cvtps2pi    mm4, xmm0               ; round to int32, mm4=data3L=(03 13)
+    cvtps2pi    mm6, xmm4               ; round to int32, mm6=data5H=(25 35)
+    cvtps2pi    mm7, xmm6               ; round to int32, mm7=data3H=(23 33)
+    packssdw    mm5, mm6                ; mm5=data5=(05 15 25 35)
+    packssdw    mm4, mm7                ; mm4=data3=(03 13 23 33)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm2, mm3                ; mm2=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm5                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm6
+    paddb       mm1, mm6
+    paddb       mm2, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm1                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm1                ; mm7=(06 07 16 17 26 27 36 37)
+    movq        mm3, mm2                ; transpose coefficients(phase 1)
+    punpcklbw   mm2, mm4                ; mm2=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm3, mm4                ; mm3=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm2                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm6, mm3                ; transpose coefficients(phase 2)
+    punpcklwd   mm3, mm7                ; mm3=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm6, mm7                ; mm6=(24 25 26 27 34 35 36 37)
+
+    movq        mm1, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm3                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm1, mm3                ; mm1=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm6                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm6                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctflt-sse2.asm b/media/libjpeg/simd/i386/jidctflt-sse2.asm
new file mode 100644
index 0000000000..c646eaef76
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctflt-sse2.asm
@@ -0,0 +1,497 @@
+;
+; jidctflt.asm - floating-point IDCT (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414        times 4  dd  1.414213562373095048801689
+PD_1_847        times 4  dd  1.847759065022573512256366
+PD_1_082        times 4  dd  1.082392200292393968799446
+PD_M2_613       times 4  dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         ecx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
+    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0                     ; xmm3=tmp12
+    subps       xmm4, xmm0                     ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2
+    paddb       xmm1, xmm2
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                            ; ctr
+    jnz         near .rowloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctfst-mmx.asm b/media/libjpeg/simd/i386/jidctfst-mmx.asm
new file mode 100644
index 0000000000..24622d4369
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-mmx.asm
@@ -0,0 +1,499 @@
+;
+; jidctfst.asm - fast integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_mmx)
+
+EXTN(jconst_idct_ifast_mmx):
+
+PW_F1414       times 4 dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 4 dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 4 dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 4 dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)
+
+EXTN(jsimd_idct_ifast_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 01 02 03)
+    paddw       mm7, mm0                ; mm7=data1=(10 11 12 13)
+    psubw       mm1, mm3                ; mm1=data7=(70 71 72 73)
+    psubw       mm5, mm0                ; mm5=data6=(60 61 62 63)
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    movq        mm3, mm6                ; transpose coefficients(phase 1)
+    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
+    punpckhwd   mm3, mm7                ; mm3=(02 12 03 13)
+    movq        mm0, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm1                ; mm5=(60 70 61 71)
+    punpckhwd   mm0, mm1                ; mm0=(62 72 63 73)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
+    movq        mm1, MMWORD [wk(1)]     ; mm1=tmp3
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
+    movq        MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm7
+    movq        mm0, mm1
+    paddw       mm7, mm4                ; mm7=data2=(20 21 22 23)
+    paddw       mm1, mm2                ; mm1=data4=(40 41 42 43)
+    psubw       mm5, mm4                ; mm5=data5=(50 51 52 53)
+    psubw       mm0, mm2                ; mm0=data3=(30 31 32 33)
+
+    movq        mm4, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm0                ; mm7=(20 30 21 31)
+    punpckhwd   mm4, mm0                ; mm4=(22 32 23 33)
+    movq        mm2, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm5                ; mm1=(40 50 41 51)
+    punpckhwd   mm2, mm5                ; mm2=(42 52 43 53)
+
+    movq        mm0, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(00 10 20 30)
+    punpckhdq   mm0, mm7                ; mm0=(01 11 21 31)
+    movq        mm5, mm3                ; transpose coefficients(phase 2)
+    punpckldq   mm3, mm4                ; mm3=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
+    movq        mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm6, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm7                ; mm1=(40 50 60 70)
+    punpckhdq   mm6, mm7                ; mm6=(41 51 61 71)
+    movq        mm0, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(42 52 62 72)
+    punpckhdq   mm0, mm4                ; mm0=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_IFAST_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm1
+    psubw       mm0, mm2                ; mm0=tmp11
+    psubw       mm1, mm3
+    paddw       mm4, mm2                ; mm4=tmp10
+    paddw       mm5, mm3                ; mm5=tmp13
+
+    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       mm1, mm5                ; mm1=tmp12
+
+    movq        mm6, mm4
+    movq        mm7, mm0
+    psubw       mm4, mm5                ; mm4=tmp3
+    psubw       mm0, mm1                ; mm0=tmp2
+    paddw       mm6, mm5                ; mm6=tmp0
+    paddw       mm7, mm1                ; mm7=tmp1
+
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
+    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm2
+    movq        mm0, mm5
+    psubw       mm2, mm1                ; mm2=z12
+    psubw       mm5, mm3                ; mm5=z10
+    paddw       mm4, mm1                ; mm4=z11
+    paddw       mm0, mm3                ; mm0=z13
+
+    movq        mm1, mm5                ; mm1=z10(unscaled)
+    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       mm5, PRE_MULTIPLY_SCALE_BITS
+
+    movq        mm3, mm4
+    psubw       mm4, mm0
+    paddw       mm3, mm0                ; mm3=tmp7
+
+    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movq        mm0, mm5
+    paddw       mm5, mm2
+    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
+    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       mm0, mm1
+    psubw       mm2, mm5                ; mm2=tmp10
+    paddw       mm0, mm5                ; mm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       mm0, mm3                ; mm0=tmp6
+    movq        mm1, mm6
+    movq        mm5, mm7
+    paddw       mm6, mm3                ; mm6=data0=(00 10 20 30)
+    paddw       mm7, mm0                ; mm7=data1=(01 11 21 31)
+    psraw       mm6, (PASS1_BITS+3)     ; descale
+    psraw       mm7, (PASS1_BITS+3)     ; descale
+    psubw       mm1, mm3                ; mm1=data7=(07 17 27 37)
+    psubw       mm5, mm0                ; mm5=data6=(06 16 26 36)
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psubw       mm4, mm0                ; mm4=tmp5
+
+    packsswb    mm6, mm5                ; mm6=(00 10 20 30 06 16 26 36)
+    packsswb    mm7, mm1                ; mm7=(01 11 21 31 07 17 27 37)
+
+    movq        mm3, MMWORD [wk(0)]     ; mm3=tmp2
+    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp3
+
+    paddw       mm2, mm4                ; mm2=tmp4
+    movq        mm5, mm3
+    movq        mm1, mm0
+    paddw       mm3, mm4                ; mm3=data2=(02 12 22 32)
+    paddw       mm0, mm2                ; mm0=data4=(04 14 24 34)
+    psraw       mm3, (PASS1_BITS+3)     ; descale
+    psraw       mm0, (PASS1_BITS+3)     ; descale
+    psubw       mm5, mm4                ; mm5=data5=(05 15 25 35)
+    psubw       mm1, mm2                ; mm1=data3=(03 13 23 33)
+    psraw       mm5, (PASS1_BITS+3)     ; descale
+    psraw       mm1, (PASS1_BITS+3)     ; descale
+
+    movq        mm4, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm4=[PB_CENTERJSAMP]
+
+    packsswb    mm3, mm0                ; mm3=(02 12 22 32 04 14 24 34)
+    packsswb    mm1, mm5                ; mm1=(03 13 23 33 05 15 25 35)
+
+    paddb       mm6, mm4
+    paddb       mm7, mm4
+    paddb       mm3, mm4
+    paddb       mm1, mm4
+
+    movq        mm2, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm7                ; mm6=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm7                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm0, mm3                ; transpose coefficients(phase 1)
+    punpcklbw   mm3, mm1                ; mm3=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm0, mm1                ; mm0=(04 05 14 15 24 25 34 35)
+
+    movq        mm5, mm6                ; transpose coefficients(phase 2)
+    punpcklwd   mm6, mm3                ; mm6=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm5, mm3                ; mm5=(20 21 22 23 30 31 32 33)
+    movq        mm4, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm2                ; mm0=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm4, mm2                ; mm4=(24 25 26 27 34 35 36 37)
+
+    movq        mm7, mm6                ; transpose coefficients(phase 3)
+    punpckldq   mm6, mm0                ; mm6=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13 14 15 16 17)
+    movq        mm1, mm5                ; transpose coefficients(phase 3)
+    punpckldq   mm5, mm4                ; mm5=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm1, mm4                ; mm1=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctfst-sse2.asm b/media/libjpeg/simd/i386/jidctfst-sse2.asm
new file mode 100644
index 0000000000..19704ffa48
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctfst-sse2.asm
@@ -0,0 +1,501 @@
+;
+; jidctfst.asm - fast integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))       ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1414)]  ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1847)]   ; xmm5=z5
+    pmulhw      xmm0, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm0, xmm1
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [GOTOFF(ebx,PW_F1414)]
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [GOTOFF(ebx,PW_F1414)]  ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0
+    pmulhw      xmm4, [GOTOFF(ebx,PW_F1847)]   ; xmm4=z5
+    pmulhw      xmm6, [GOTOFF(ebx,PW_MF1613)]
+    pmulhw      xmm0, [GOTOFF(ebx,PW_F1082)]
+    psubw       xmm6, xmm7
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctint-avx2.asm b/media/libjpeg/simd/i386/jidctint-avx2.asm
new file mode 100644
index 0000000000..199c7df3b6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-avx2.asm
@@ -0,0 +1,453 @@
+;
+; jidctint.asm - accurate integer IDCT (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
+    ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %8=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)
+
+    vpermq      %5, %1, 0xD8
+    vpermq      %6, %2, 0x72
+    vpermq      %7, %3, 0xD8
+    vpermq      %8, %4, 0x72
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers
+; %5-%12: Temp registers
+; %9:     Pass (1 or 2)
+
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [GOTOFF(ebx,PW_F130_F054_MF130_F054)]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [GOTOFF(ebx,PW_1_NEG1)]
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %3, %8, %5
+    vmovdqu     %11, %3                 ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %3, %8, %5
+    vmovdqu     %9, %3                  ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %3, %1, %6
+    vmovdqu     %12, %3                 ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %3, %1, %6
+    vmovdqu     %10, %3                 ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [GOTOFF(ebx,PW_MF078_F117_F078_F117)]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [GOTOFF(ebx,PW_MF060_MF089_MF050_MF256)]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [GOTOFF(ebx,PW_MF089_F060_MF256_F050)]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vmovdqu     %3, %9
+    vmovdqu     %4, %10
+
+    vpaddd      %1, %3, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %4, %8              ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %3, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %4, %8              ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %4, %4, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vmovdqu     %7, %11
+    vmovdqu     %8, %12
+
+    vpaddd      %2, %7, %5              ; %7=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %3, %8, %6              ; %8=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %2, %2, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpackssdw   %2, %2, %3              ; %2=data3_2
+
+    vpsubd      %3, %7, %5              ; %7=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %6, %8, %6              ; %8=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %3, %3, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpaddd      %6, %6, [GOTOFF(ebx,PD_DESCALE_P %+ %13)]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %6, %6, DESCALE_P %+ %13
+    vpackssdw   %3, %3, %6              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_YMMWORD
+                                        ; ymmword wk[WK_NUM]
+%define WK_NUM         4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, XMMWORD [wk(0)], XMMWORD [wk(1)], XMMWORD [wk(2)], XMMWORD [wk(3)], 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [GOTOFF(ebx,PB_CENTERJSAMP)]
+    vpaddb      ymm1, ymm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm3=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm2=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm1=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    mov         edx, JSAMPROW [edi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctint-mmx.asm b/media/libjpeg/simd/i386/jidctint-mmx.asm
new file mode 100644
index 0000000000..f15c8d34bc
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-mmx.asm
@@ -0,0 +1,851 @@
+;
+; jidctint.asm - accurate integer IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_mmx)
+
+EXTN(jconst_idct_islow_mmx):
+
+PW_F130_F054   times 2 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 2 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 2 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 2 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 2 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 2 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 2 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 2 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 2 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 2 dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_mmx(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         12
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_mmx)
+
+EXTN(jsimd_idct_islow_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm1, mm0
+    packsswb    mm1, mm1
+    movd        eax, mm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       mm0, PASS1_BITS
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm3=[PD_DESCALE_P1]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P1
+    psrad       mm7, DESCALE_P1
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 01 02 03)
+    packssdw    mm2, mm0                ; mm2=data7=(70 71 72 73)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm1=[PD_DESCALE_P1]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P1
+    psrad       mm3, DESCALE_P1
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+
+    packssdw    mm4, mm3                ; mm4=data1=(10 11 12 13)
+    packssdw    mm7, mm0                ; mm7=data6=(60 61 62 63)
+
+    movq        mm6, mm5                ; transpose coefficients(phase 1)
+    punpcklwd   mm5, mm4                ; mm5=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm1, mm7                ; transpose coefficients(phase 1)
+    punpcklwd   mm7, mm2                ; mm7=(60 70 61 71)
+    punpckhwd   mm1, mm2                ; mm1=(62 72 63 73)
+
+    movq        mm3, MMWORD [wk(6)]     ; mm3=tmp12L
+    movq        mm0, MMWORD [wk(7)]     ; mm0=tmp12H
+    movq        mm4, MMWORD [wk(10)]    ; mm4=tmp1L
+    movq        mm2, MMWORD [wk(11)]    ; mm2=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 01 11)
+    movq        MMWORD [wk(1)], mm6     ; wk(1)=(02 12 03 13)
+    movq        MMWORD [wk(4)], mm7     ; wk(4)=(60 70 61 71)
+    movq        MMWORD [wk(5)], mm1     ; wk(5)=(62 72 63 73)
+
+    movq        mm5, mm3
+    movq        mm6, mm0
+    paddd       mm3, mm4                ; mm3=data2L
+    paddd       mm0, mm2                ; mm0=data2H
+    psubd       mm5, mm4                ; mm5=data5L
+    psubd       mm6, mm2                ; mm6=data5H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm7=[PD_DESCALE_P1]
+
+    paddd       mm3, mm7
+    paddd       mm0, mm7
+    psrad       mm3, DESCALE_P1
+    psrad       mm0, DESCALE_P1
+    paddd       mm5, mm7
+    paddd       mm6, mm7
+    psrad       mm5, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm3, mm0                ; mm3=data2=(20 21 22 23)
+    packssdw    mm5, mm6                ; mm5=data5=(50 51 52 53)
+
+    movq        mm1, MMWORD [wk(2)]     ; mm1=tmp13L
+    movq        mm4, MMWORD [wk(3)]     ; mm4=tmp13H
+    movq        mm2, MMWORD [wk(8)]     ; mm2=tmp0L
+    movq        mm7, MMWORD [wk(9)]     ; mm7=tmp0H
+
+    movq        mm0, mm1
+    movq        mm6, mm4
+    paddd       mm1, mm2                ; mm1=data3L
+    paddd       mm4, mm7                ; mm4=data3H
+    psubd       mm0, mm2                ; mm0=data4L
+    psubd       mm6, mm7                ; mm6=data4H
+
+    movq        mm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; mm2=[PD_DESCALE_P1]
+
+    paddd       mm1, mm2
+    paddd       mm4, mm2
+    psrad       mm1, DESCALE_P1
+    psrad       mm4, DESCALE_P1
+    paddd       mm0, mm2
+    paddd       mm6, mm2
+    psrad       mm0, DESCALE_P1
+    psrad       mm6, DESCALE_P1
+
+    packssdw    mm1, mm4                ; mm1=data3=(30 31 32 33)
+    packssdw    mm0, mm6                ; mm0=data4=(40 41 42 43)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=(00 10 01 11)
+    movq        mm2, MMWORD [wk(1)]     ; mm2=(02 12 03 13)
+
+    movq        mm4, mm3                ; transpose coefficients(phase 1)
+    punpcklwd   mm3, mm1                ; mm3=(20 30 21 31)
+    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
+    movq        mm6, mm0                ; transpose coefficients(phase 1)
+    punpcklwd   mm0, mm5                ; mm0=(40 50 41 51)
+    punpckhwd   mm6, mm5                ; mm6=(42 52 43 53)
+
+    movq        mm1, mm7                ; transpose coefficients(phase 2)
+    punpckldq   mm7, mm3                ; mm7=(00 10 20 30)
+    punpckhdq   mm1, mm3                ; mm1=(01 11 21 31)
+    movq        mm5, mm2                ; transpose coefficients(phase 2)
+    punpckldq   mm2, mm4                ; mm2=(02 12 22 32)
+    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)
+
+    movq        mm3, MMWORD [wk(4)]     ; mm3=(60 70 61 71)
+    movq        mm4, MMWORD [wk(5)]     ; mm4=(62 72 63 73)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpckldq   mm0, mm3                ; mm0=(40 50 60 70)
+    punpckhdq   mm7, mm3                ; mm7=(41 51 61 71)
+    movq        mm1, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm4                ; mm6=(42 52 62 72)
+    punpckhdq   mm1, mm4                ; mm1=(43 53 63 73)
+
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+    mov         ecx, DCTSIZE/4                     ; ctr
+    alignx      16, 7
+.rowloop:
+
+    ; -- Even part
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movq        mm4, mm1                ; mm1=in2=z2
+    movq        mm5, mm1
+    punpcklwd   mm4, mm3                ; mm3=in6=z3
+    punpckhwd   mm5, mm3
+    movq        mm1, mm4
+    movq        mm3, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]   ; mm4=tmp3L
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F130_F054)]   ; mm5=tmp3H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]  ; mm1=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F054_MF130)]  ; mm3=tmp2H
+
+    movq        mm6, mm0
+    paddw       mm0, mm2                ; mm0=in0+in4
+    psubw       mm6, mm2                ; mm6=in0-in4
+
+    pxor        mm7, mm7
+    pxor        mm2, mm2
+    punpcklwd   mm7, mm0                ; mm7=tmp0L
+    punpckhwd   mm2, mm0                ; mm2=tmp0H
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+    psrad       mm2, (16-CONST_BITS)    ; psrad mm2,16 & pslld mm2,CONST_BITS
+
+    movq        mm0, mm7
+    paddd       mm7, mm4                ; mm7=tmp10L
+    psubd       mm0, mm4                ; mm0=tmp13L
+    movq        mm4, mm2
+    paddd       mm2, mm5                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp13H
+
+    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
+    movq        MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
+    movq        MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
+    movq        MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
+
+    pxor        mm5, mm5
+    pxor        mm7, mm7
+    punpcklwd   mm5, mm6                ; mm5=tmp1L
+    punpckhwd   mm7, mm6                ; mm7=tmp1H
+    psrad       mm5, (16-CONST_BITS)    ; psrad mm5,16 & pslld mm5,CONST_BITS
+    psrad       mm7, (16-CONST_BITS)    ; psrad mm7,16 & pslld mm7,CONST_BITS
+
+    movq        mm2, mm5
+    paddd       mm5, mm1                ; mm5=tmp11L
+    psubd       mm2, mm1                ; mm2=tmp12L
+    movq        mm0, mm7
+    paddd       mm7, mm3                ; mm7=tmp11H
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    movq        MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
+    movq        MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
+    movq        MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
+    movq        MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movq        mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm5, mm6
+    movq        mm7, mm4
+    paddw       mm5, mm3                ; mm5=z3
+    paddw       mm7, mm1                ; mm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movq        mm2, mm5
+    movq        mm0, mm5
+    punpcklwd   mm2, mm7
+    punpckhwd   mm0, mm7
+    movq        mm5, mm2
+    movq        mm7, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF078_F117)]  ; mm2=z3L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF078_F117)]  ; mm0=z3H
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F117_F078)]   ; mm5=z4L
+    pmaddwd     mm7, [GOTOFF(ebx,PW_F117_F078)]   ; mm7=z4H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=z3L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movq        mm2, mm3
+    movq        mm0, mm3
+    punpcklwd   mm2, mm4
+    punpckhwd   mm0, mm4
+    movq        mm3, mm2
+    movq        mm4, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm2=tmp0L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; mm0=tmp0H
+    pmaddwd     mm3, [GOTOFF(ebx,PW_MF089_F060)]   ; mm3=tmp3L
+    pmaddwd     mm4, [GOTOFF(ebx,PW_MF089_F060)]   ; mm4=tmp3H
+
+    paddd       mm2, MMWORD [wk(10)]    ; mm2=tmp0L
+    paddd       mm0, MMWORD [wk(11)]    ; mm0=tmp0H
+    paddd       mm3, mm5                ; mm3=tmp3L
+    paddd       mm4, mm7                ; mm4=tmp3H
+
+    movq        MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
+    movq        MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
+
+    movq        mm2, mm1
+    movq        mm0, mm1
+    punpcklwd   mm2, mm6
+    punpckhwd   mm0, mm6
+    movq        mm1, mm2
+    movq        mm6, mm0
+    pmaddwd     mm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm2=tmp1L
+    pmaddwd     mm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; mm0=tmp1H
+    pmaddwd     mm1, [GOTOFF(ebx,PW_MF256_F050)]   ; mm1=tmp2L
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF256_F050)]   ; mm6=tmp2H
+
+    paddd       mm2, mm5                ; mm2=tmp1L
+    paddd       mm0, mm7                ; mm0=tmp1H
+    paddd       mm1, MMWORD [wk(10)]    ; mm1=tmp2L
+    paddd       mm6, MMWORD [wk(11)]    ; mm6=tmp2H
+
+    movq        MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
+    movq        MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movq        mm5, MMWORD [wk(0)]     ; mm5=tmp10L
+    movq        mm7, MMWORD [wk(1)]     ; mm7=tmp10H
+
+    movq        mm2, mm5
+    movq        mm0, mm7
+    paddd       mm5, mm3                ; mm5=data0L
+    paddd       mm7, mm4                ; mm7=data0H
+    psubd       mm2, mm3                ; mm2=data7L
+    psubd       mm0, mm4                ; mm0=data7H
+
+    movq        mm3, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm3=[PD_DESCALE_P2]
+
+    paddd       mm5, mm3
+    paddd       mm7, mm3
+    psrad       mm5, DESCALE_P2
+    psrad       mm7, DESCALE_P2
+    paddd       mm2, mm3
+    paddd       mm0, mm3
+    psrad       mm2, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm5, mm7                ; mm5=data0=(00 10 20 30)
+    packssdw    mm2, mm0                ; mm2=data7=(07 17 27 37)
+
+    movq        mm4, MMWORD [wk(4)]     ; mm4=tmp11L
+    movq        mm3, MMWORD [wk(5)]     ; mm3=tmp11H
+
+    movq        mm7, mm4
+    movq        mm0, mm3
+    paddd       mm4, mm1                ; mm4=data1L
+    paddd       mm3, mm6                ; mm3=data1H
+    psubd       mm7, mm1                ; mm7=data6L
+    psubd       mm0, mm6                ; mm0=data6H
+
+    movq        mm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm1=[PD_DESCALE_P2]
+
+    paddd       mm4, mm1
+    paddd       mm3, mm1
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm7, mm1
+    paddd       mm0, mm1
+    psrad       mm7, DESCALE_P2
+    psrad       mm0, DESCALE_P2
+
+    packssdw    mm4, mm3                ; mm4=data1=(01 11 21 31)
+    packssdw    mm7, mm0                ; mm7=data6=(06 16 26 36)
+
+    packsswb    mm5, mm7                ; mm5=(00 10 20 30 06 16 26 36)
+    packsswb    mm4, mm2                ; mm4=(01 11 21 31 07 17 27 37)
+
+    movq        mm6, MMWORD [wk(6)]     ; mm6=tmp12L
+    movq        mm1, MMWORD [wk(7)]     ; mm1=tmp12H
+    movq        mm3, MMWORD [wk(10)]    ; mm3=tmp1L
+    movq        mm0, MMWORD [wk(11)]    ; mm0=tmp1H
+
+    movq        MMWORD [wk(0)], mm5     ; wk(0)=(00 10 20 30 06 16 26 36)
+    movq        MMWORD [wk(1)], mm4     ; wk(1)=(01 11 21 31 07 17 27 37)
+
+    movq        mm7, mm6
+    movq        mm2, mm1
+    paddd       mm6, mm3                ; mm6=data2L
+    paddd       mm1, mm0                ; mm1=data2H
+    psubd       mm7, mm3                ; mm7=data5L
+    psubd       mm2, mm0                ; mm2=data5H
+
+    movq        mm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm5=[PD_DESCALE_P2]
+
+    paddd       mm6, mm5
+    paddd       mm1, mm5
+    psrad       mm6, DESCALE_P2
+    psrad       mm1, DESCALE_P2
+    paddd       mm7, mm5
+    paddd       mm2, mm5
+    psrad       mm7, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    packssdw    mm6, mm1                ; mm6=data2=(02 12 22 32)
+    packssdw    mm7, mm2                ; mm7=data5=(05 15 25 35)
+
+    movq        mm4, MMWORD [wk(2)]     ; mm4=tmp13L
+    movq        mm3, MMWORD [wk(3)]     ; mm3=tmp13H
+    movq        mm0, MMWORD [wk(8)]     ; mm0=tmp0L
+    movq        mm5, MMWORD [wk(9)]     ; mm5=tmp0H
+
+    movq        mm1, mm4
+    movq        mm2, mm3
+    paddd       mm4, mm0                ; mm4=data3L
+    paddd       mm3, mm5                ; mm3=data3H
+    psubd       mm1, mm0                ; mm1=data4L
+    psubd       mm2, mm5                ; mm2=data4H
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2)]  ; mm0=[PD_DESCALE_P2]
+
+    paddd       mm4, mm0
+    paddd       mm3, mm0
+    psrad       mm4, DESCALE_P2
+    psrad       mm3, DESCALE_P2
+    paddd       mm1, mm0
+    paddd       mm2, mm0
+    psrad       mm1, DESCALE_P2
+    psrad       mm2, DESCALE_P2
+
+    movq        mm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm5=[PB_CENTERJSAMP]
+
+    packssdw    mm4, mm3                ; mm4=data3=(03 13 23 33)
+    packssdw    mm1, mm2                ; mm1=data4=(04 14 24 34)
+
+    movq        mm0, MMWORD [wk(0)]     ; mm0=(00 10 20 30 06 16 26 36)
+    movq        mm3, MMWORD [wk(1)]     ; mm3=(01 11 21 31 07 17 27 37)
+
+    packsswb    mm6, mm1                ; mm6=(02 12 22 32 04 14 24 34)
+    packsswb    mm4, mm7                ; mm4=(03 13 23 33 05 15 25 35)
+
+    paddb       mm0, mm5
+    paddb       mm3, mm5
+    paddb       mm6, mm5
+    paddb       mm4, mm5
+
+    movq        mm2, mm0                ; transpose coefficients(phase 1)
+    punpcklbw   mm0, mm3                ; mm0=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm2, mm3                ; mm2=(06 07 16 17 26 27 36 37)
+    movq        mm1, mm6                ; transpose coefficients(phase 1)
+    punpcklbw   mm6, mm4                ; mm6=(02 03 12 13 22 23 32 33)
+    punpckhbw   mm1, mm4                ; mm1=(04 05 14 15 24 25 34 35)
+
+    movq        mm7, mm0                ; transpose coefficients(phase 2)
+    punpcklwd   mm0, mm6                ; mm0=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm7, mm6                ; mm7=(20 21 22 23 30 31 32 33)
+    movq        mm5, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm2                ; mm1=(04 05 06 07 14 15 16 17)
+    punpckhwd   mm5, mm2                ; mm5=(24 25 26 27 34 35 36 37)
+
+    movq        mm3, mm0                ; transpose coefficients(phase 3)
+    punpckldq   mm0, mm1                ; mm0=(00 01 02 03 04 05 06 07)
+    punpckhdq   mm3, mm1                ; mm3=(10 11 12 13 14 15 16 17)
+    movq        mm4, mm7                ; transpose coefficients(phase 3)
+    punpckldq   mm7, mm5                ; mm7=(20 21 22 23 24 25 26 27)
+    punpckhdq   mm4, mm5                ; mm4=(30 31 32 33 34 35 36 37)
+
+    pushpic     ebx                     ; save GOT address
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
+    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+
+    poppic      ebx                     ; restore GOT address
+
+    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
+    add         edi, byte 4*SIZEOF_JSAMPROW
+    dec         ecx                          ; ctr
+    jnz         near .rowloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctint-sse2.asm b/media/libjpeg/simd/i386/jidctint-sse2.asm
new file mode 100644
index 0000000000..43e320189b
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctint-sse2.asm
@@ -0,0 +1,858 @@
+;
+; jidctint.asm - accurate integer IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; jpeg_component_info *compptr
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         12
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm2=z3L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F117_F078)]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm3=[PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm1=[PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm7=[PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm2=[PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3H
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F117_F078)]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm1=[PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm2=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm5=[PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm7=[PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm5=[PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
+    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
+    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctred-mmx.asm b/media/libjpeg/simd/i386/jidctred-mmx.asm
new file mode 100644
index 0000000000..e2307e1cb6
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-mmx.asm
@@ -0,0 +1,704 @@
+;
+; jidctred.asm - reduced-size IDCT (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_mmx)
+
+EXTN(jconst_idct_red_mmx):
+
+PW_F184_MF076   times 2 dw  F_1_847, -F_0_765
+PW_F256_F089    times 2 dw  F_2_562,  F_0_899
+PW_F106_MF217   times 2 dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 2 dw -F_0_601, -F_0_509
+PW_F145_MF021   times 2 dw  F_1_451, -F_0_211
+PW_F362_MF127   times 2 dw  F_3_624, -F_1_272
+PW_F085_MF072   times 2 dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
+                                        ; mmword wk[WK_NUM]
+%define WK_NUM         2
+%define workspace      wk(0) - DCTSIZE2 * SIZEOF_JCOEF
+                                        ; JCOEF workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_mmx)
+
+EXTN(jsimd_idct_4x4_mmx):
+    push        ebp
+    mov         eax, esp                    ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
+    mov         [esp], eax
+    mov         ebp, esp                    ; ebp = aligned ebp
+    lea         esp, [workspace]
+    pushpic     ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+    lea         edi, [workspace]                 ; JCOEF *wsptr
+    mov         ecx, DCTSIZE/4                   ; ctr
+    alignx      16, 7
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         mm0, mm1
+    packsswb    mm0, mm0
+    movd        eax, mm0
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       mm0, PASS1_BITS
+
+    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
+    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
+    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)
+
+    movq        mm1, mm0
+    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
+    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
+    movq        mm3, mm2
+    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
+    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+    jmp         near .nextcolumn
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1
+    punpckhwd   mm5, mm1
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3
+    punpckhwd   mm7, mm3
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm6=[PD_DESCALE_P1_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P1_4
+    psrad       mm2, DESCALE_P1_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 01 02 03)
+    packssdw    mm5, mm3                ; mm5=data3=(30 31 32 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; mm7=[PD_DESCALE_P1_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P1_4
+    psrad       mm0, DESCALE_P1_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P1_4
+    psrad       mm3, DESCALE_P1_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(10 11 12 13)
+    packssdw    mm2, mm3                ; mm2=data2=(20 21 22 23)
+
+    movq        mm6, mm1                ; transpose coefficients(phase 1)
+    punpcklwd   mm1, mm4                ; mm1=(00 10 01 11)
+    punpckhwd   mm6, mm4                ; mm6=(02 12 03 13)
+    movq        mm7, mm2                ; transpose coefficients(phase 1)
+    punpcklwd   mm2, mm5                ; mm2=(20 30 21 31)
+    punpckhwd   mm7, mm5                ; mm7=(22 32 23 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpckldq   mm1, mm2                ; mm1=(00 10 20 30)
+    punpckhdq   mm0, mm2                ; mm0=(01 11 21 31)
+    movq        mm3, mm6                ; transpose coefficients(phase 2)
+    punpckldq   mm6, mm7                ; mm6=(02 12 22 32)
+    punpckhdq   mm3, mm7                ; mm3=(03 13 23 33)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
+
+.nextcolumn:
+    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
+    add         edx, byte 4*SIZEOF_ISLOW_MULT_TYPE  ; quantptr
+    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
+    dec         ecx                                 ; ctr
+    jnz         near .columnloop
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         eax, [original_ebp]
+    lea         esi, [workspace]                   ; JCOEF *wsptr
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+
+    movq        mm4, mm0
+    movq        mm5, mm0
+    punpcklwd   mm4, mm1
+    punpckhwd   mm5, mm1
+    movq        mm0, mm4
+    movq        mm1, mm5
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F256_F089)]   ; mm4=(tmp2L)
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F256_F089)]   ; mm5=(tmp2H)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F106_MF217)]  ; mm0=(tmp0L)
+    pmaddwd     mm1, [GOTOFF(ebx,PW_F106_MF217)]  ; mm1=(tmp0H)
+
+    movq        mm6, mm2
+    movq        mm7, mm2
+    punpcklwd   mm6, mm3
+    punpckhwd   mm7, mm3
+    movq        mm2, mm6
+    movq        mm3, mm7
+    pmaddwd     mm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm6=(tmp2L)
+    pmaddwd     mm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; mm7=(tmp2H)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F145_MF021)]   ; mm2=(tmp0L)
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F145_MF021)]   ; mm3=(tmp0H)
+
+    paddd       mm6, mm4                ; mm6=tmp2L
+    paddd       mm7, mm5                ; mm7=tmp2H
+    paddd       mm2, mm0                ; mm2=tmp0L
+    paddd       mm3, mm1                ; mm3=tmp0H
+
+    movq        MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
+    movq        MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movq        mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movq        mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+
+    pxor        mm1, mm1
+    pxor        mm2, mm2
+    punpcklwd   mm1, mm4                ; mm1=tmp0L
+    punpckhwd   mm2, mm4                ; mm2=tmp0H
+    psrad       mm1, (16-CONST_BITS-1)  ; psrad mm1,16 & pslld mm1,CONST_BITS+1
+    psrad       mm2, (16-CONST_BITS-1)  ; psrad mm2,16 & pslld mm2,CONST_BITS+1
+
+    movq        mm3, mm5                ; mm5=in2=z2
+    punpcklwd   mm5, mm0                ; mm0=in6=z3
+    punpckhwd   mm3, mm0
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F184_MF076)]  ; mm5=tmp2L
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F184_MF076)]  ; mm3=tmp2H
+
+    movq        mm4, mm1
+    movq        mm0, mm2
+    paddd       mm1, mm5                ; mm1=tmp10L
+    paddd       mm2, mm3                ; mm2=tmp10H
+    psubd       mm4, mm5                ; mm4=tmp12L
+    psubd       mm0, mm3                ; mm0=tmp12H
+
+    ; -- Final output stage
+
+    movq        mm5, mm1
+    movq        mm3, mm2
+    paddd       mm1, mm6                ; mm1=data0L
+    paddd       mm2, mm7                ; mm2=data0H
+    psubd       mm5, mm6                ; mm5=data3L
+    psubd       mm3, mm7                ; mm3=data3H
+
+    movq        mm6, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm6=[PD_DESCALE_P2_4]
+
+    paddd       mm1, mm6
+    paddd       mm2, mm6
+    psrad       mm1, DESCALE_P2_4
+    psrad       mm2, DESCALE_P2_4
+    paddd       mm5, mm6
+    paddd       mm3, mm6
+    psrad       mm5, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm1, mm2                ; mm1=data0=(00 10 20 30)
+    packssdw    mm5, mm3                ; mm5=data3=(03 13 23 33)
+
+    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp0L
+    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp0H
+
+    movq        mm2, mm4
+    movq        mm3, mm0
+    paddd       mm4, mm7                ; mm4=data1L
+    paddd       mm0, mm6                ; mm0=data1H
+    psubd       mm2, mm7                ; mm2=data2L
+    psubd       mm3, mm6                ; mm3=data2H
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; mm7=[PD_DESCALE_P2_4]
+
+    paddd       mm4, mm7
+    paddd       mm0, mm7
+    psrad       mm4, DESCALE_P2_4
+    psrad       mm0, DESCALE_P2_4
+    paddd       mm2, mm7
+    paddd       mm3, mm7
+    psrad       mm2, DESCALE_P2_4
+    psrad       mm3, DESCALE_P2_4
+
+    packssdw    mm4, mm0                ; mm4=data1=(01 11 21 31)
+    packssdw    mm2, mm3                ; mm2=data2=(02 12 22 32)
+
+    movq        mm6, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm6=[PB_CENTERJSAMP]
+
+    packsswb    mm1, mm2                ; mm1=(00 10 20 30 02 12 22 32)
+    packsswb    mm4, mm5                ; mm4=(01 11 21 31 03 13 23 33)
+    paddb       mm1, mm6
+    paddb       mm4, mm6
+
+    movq        mm7, mm1                ; transpose coefficients(phase 1)
+    punpcklbw   mm1, mm4                ; mm1=(00 01 10 11 20 21 30 31)
+    punpckhbw   mm7, mm4                ; mm7=(02 03 12 13 22 23 32 33)
+
+    movq        mm0, mm1                ; transpose coefficients(phase 2)
+    punpcklwd   mm1, mm7                ; mm1=(00 01 02 03 10 11 12 13)
+    punpckhwd   mm0, mm7                ; mm0=(20 21 22 23 30 31 32 33)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+    psrlq       mm1, 4*BYTE_BIT
+    psrlq       mm0, 4*BYTE_BIT
+
+    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movd        dword [edx+eax*SIZEOF_JSAMPLE], mm1
+    movd        dword [esi+eax*SIZEOF_JSAMPLE], mm0
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_mmx(void *dct_table, JCOEFPTR coef_block,
+;                    JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_mmx)
+
+EXTN(jsimd_idct_2x2_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movq        mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
+    ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
+
+    pcmpeqd     mm7, mm7
+    pslld       mm7, WORD_BIT           ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
+
+    movq        mm4, mm0                ; mm4=(10 11 ** 13)
+    movq        mm5, mm2                ; mm5=(50 51 ** 53)
+    punpcklwd   mm4, mm1                ; mm4=(10 30 11 31)
+    punpcklwd   mm5, mm3                ; mm5=(50 70 51 71)
+    pmaddwd     mm4, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    psrld       mm0, WORD_BIT           ; mm0=(11 -- 13 --)
+    pand        mm1, mm7                ; mm1=(-- 31 -- 33)
+    psrld       mm2, WORD_BIT           ; mm2=(51 -- 53 --)
+    pand        mm3, mm7                ; mm3=(-- 71 -- 73)
+    por         mm0, mm1                ; mm0=(11 31 13 33)
+    por         mm2, mm3                ; mm2=(51 71 53 73)
+    pmaddwd     mm0, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm4, mm5                ; mm4=tmp0[col0 col1]
+
+    movq        mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
+    movq        mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movq        mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
+    ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
+
+    psrld       mm6, WORD_BIT           ; mm6=(15 -- 17 --)
+    pand        mm1, mm7                ; mm1=(-- 35 -- 37)
+    psrld       mm3, WORD_BIT           ; mm3=(55 -- 57 --)
+    pand        mm5, mm7                ; mm5=(-- 75 -- 77)
+    por         mm6, mm1                ; mm6=(15 35 17 37)
+    por         mm3, mm5                ; mm3=(55 75 57 77)
+    pmaddwd     mm6, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm3, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm0, mm2                ; mm0=tmp0[col1 col3]
+    paddd       mm6, mm3                ; mm6=tmp0[col5 col7]
+
+    ; -- Even part
+
+    movq        mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movq        mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
+    pmullw      mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
+
+    movq        mm2, mm1                      ; mm2=(00 01 ** 03)
+    pslld       mm1, WORD_BIT                 ; mm1=(-- 00 -- **)
+    psrad       mm1, (WORD_BIT-CONST_BITS-2)  ; mm1=tmp10[col0 ****]
+
+    pand        mm2, mm7                      ; mm2=(-- 01 -- 03)
+    pand        mm5, mm7                      ; mm5=(-- 05 -- 07)
+    psrad       mm2, (WORD_BIT-CONST_BITS-2)  ; mm2=tmp10[col1 col3]
+    psrad       mm5, (WORD_BIT-CONST_BITS-2)  ; mm5=tmp10[col5 col7]
+
+    ; -- Final output stage
+
+    movq        mm3, mm1
+    paddd       mm1, mm4                ; mm1=data0[col0 ****]=(A0 **)
+    psubd       mm3, mm4                ; mm3=data1[col0 ****]=(B0 **)
+    punpckldq   mm1, mm3                ; mm1=(A0 B0)
+
+    movq        mm7, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; mm7=[PD_DESCALE_P1_2]
+
+    movq        mm4, mm2
+    movq        mm3, mm5
+    paddd       mm2, mm0                ; mm2=data0[col1 col3]=(A1 A3)
+    paddd       mm5, mm6                ; mm5=data0[col5 col7]=(A5 A7)
+    psubd       mm4, mm0                ; mm4=data1[col1 col3]=(B1 B3)
+    psubd       mm3, mm6                ; mm3=data1[col5 col7]=(B5 B7)
+
+    paddd       mm1, mm7
+    psrad       mm1, DESCALE_P1_2
+
+    paddd       mm2, mm7
+    paddd       mm5, mm7
+    psrad       mm2, DESCALE_P1_2
+    psrad       mm5, DESCALE_P1_2
+    paddd       mm4, mm7
+    paddd       mm3, mm7
+    psrad       mm4, DESCALE_P1_2
+    psrad       mm3, DESCALE_P1_2
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    mm2, mm4                ; mm2=(A1 A3 B1 B3)
+    packssdw    mm5, mm3                ; mm5=(A5 A7 B5 B7)
+    pmaddwd     mm2, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     mm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       mm2, mm5                ; mm2=tmp0[row0 row1]
+
+    ; -- Even part
+
+    pslld       mm1, (CONST_BITS+2)     ; mm1=tmp10[row0 row1]
+
+    ; -- Final output stage
+
+    movq        mm0, [GOTOFF(ebx,PD_DESCALE_P2_2)]  ; mm0=[PD_DESCALE_P2_2]
+
+    movq        mm6, mm1
+    paddd       mm1, mm2                ; mm1=data0[row0 row1]=(C0 C1)
+    psubd       mm6, mm2                ; mm6=data1[row0 row1]=(D0 D1)
+
+    paddd       mm1, mm0
+    paddd       mm6, mm0
+    psrad       mm1, DESCALE_P2_2
+    psrad       mm6, DESCALE_P2_2
+
+    movq        mm7, mm1                ; transpose coefficients
+    punpckldq   mm1, mm6                ; mm1=(C0 D0)
+    punpckhdq   mm7, mm6                ; mm7=(C1 D1)
+
+    packssdw    mm1, mm7                ; mm1=(C0 D0 C1 D1)
+    packsswb    mm1, mm1                ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
+    paddb       mm1, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    movd        ecx, mm1
+    movd        ebx, mm1                ; ebx=(C0 D0 C1 D1)
+    shr         ecx, 2*BYTE_BIT         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jidctred-sse2.asm b/media/libjpeg/simd/i386/jidctred-sse2.asm
new file mode 100644
index 0000000000..6e56494e97
--- /dev/null
+++ b/media/libjpeg/simd/i386/jidctred-sse2.asm
@@ -0,0 +1,592 @@
+;
+; jidctred.asm - reduced-size IDCT (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+%define original_ebp   ebp + 0
+%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM         2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        ebp
+    mov         eax, esp                     ; eax = original ebp
+    sub         esp, byte 4
+    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [esp], eax
+    mov         ebp, esp                     ; ebp = aligned ebp
+    lea         esp, [wk(0)]
+    pushpic     ebx
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+;   mov         eax, [original_ebp]
+    mov         edx, POINTER [dct_table(eax)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0
+    packsswb    xmm0, xmm0
+    movd        eax, xmm0
+    test        eax, eax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+    alignx      16, 7
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F256_F089)]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F256_F089)]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F106_MF217)]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm6=[PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1_4)]  ; xmm7=[PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         eax, [original_ebp]
+    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(eax)]
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F256_F089)]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF060_MF050)]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F106_MF217)]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F145_MF021)]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F184_MF076)]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2_4)]  ; xmm1=[PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
+    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+    poppic      ebx
+    mov         esp, ebp                ; esp <- aligned ebp
+    pop         esp                     ; esp <- original ebp
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+%define dct_table(b)   (b) + 8          ; void *dct_table
+%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
+%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
+%define output_col(b)  (b) + 20         ; JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    get_GOT     ebx                     ; get GOT address
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         edx, POINTER [dct_table(ebp)]    ; quantptr
+    mov         esi, JCOEFPTR [coef_block(ebp)]  ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm5, [GOTOFF(ebx,PW_F085_MF072)]
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm2, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6
+    movdqa      xmm5, xmm1
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1_2)]  ; xmm2=[PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2
+    psrad       xmm6, DESCALE_P1_2
+
+    paddd       xmm1, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm1, DESCALE_P1_2
+    psrad       xmm7, DESCALE_P1_2
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         edi, JSAMPARRAY [output_buf(ebp)]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [output_col(ebp)]
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [GOTOFF(ebx,PW_F362_MF127)]
+    pmaddwd     xmm7, [GOTOFF(ebx,PW_F085_MF072)]
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [GOTOFF(ebx,PD_DESCALE_P2_2)]
+    psrad       xmm6, DESCALE_P2_2
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [GOTOFF(ebx,PB_CENTERJSAMP)]
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
+    mov         esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+    mov         word [edx+eax*SIZEOF_JSAMPLE], bx
+    mov         word [esi+eax*SIZEOF_JSAMPLE], cx
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquant-3dn.asm b/media/libjpeg/simd/i386/jquant-3dn.asm
new file mode 100644
index 0000000000..5cb60caa94
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-3dn.asm
@@ -0,0 +1,230 @@
+;
+; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_3dnow(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                            FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_3dnow)
+
+EXTN(jsimd_convsamp_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    pi2fd       mm4, mm4
+    pi2fd       mm2, mm2
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    pi2fd       mm5, mm5
+    pi2fd       mm0, mm0
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
+    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
+    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    pi2fd       mm6, mm6
+    pi2fd       mm3, mm3
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    pi2fd       mm4, mm4
+    pi2fd       mm1, mm1
+
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
+    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
+    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .convloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_3dnow(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                            FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_3dnow)
+
+EXTN(jsimd_quantize_float_3dnow):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         eax, 0x4B400000         ; (float)0x00C00000 (rndint_magic)
+    movd        mm7, eax
+    punpckldq   mm7, mm7                ; mm7={12582912.0F 12582912.0F}
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm0, mm7                ; mm0=(00 ** 01 **)
+    pfadd       mm1, mm7                ; mm1=(02 ** 03 **)
+    pfadd       mm2, mm7                ; mm0=(04 ** 05 **)
+    pfadd       mm3, mm7                ; mm1=(06 ** 07 **)
+
+    movq        mm4, mm0
+    punpcklwd   mm0, mm1                ; mm0=(00 02 ** **)
+    punpckhwd   mm4, mm1                ; mm4=(01 03 ** **)
+    movq        mm5, mm2
+    punpcklwd   mm2, mm3                ; mm2=(04 06 ** **)
+    punpckhwd   mm5, mm3                ; mm5=(05 07 ** **)
+
+    punpcklwd   mm0, mm4                ; mm0=(00 01 02 03)
+    punpcklwd   mm2, mm5                ; mm2=(04 05 06 07)
+
+    movq        mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+    movq        mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
+    movq        mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
+    pfmul       mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
+    pfmul       mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
+
+    pfadd       mm6, mm7                ; mm0=(10 ** 11 **)
+    pfadd       mm1, mm7                ; mm4=(12 ** 13 **)
+    pfadd       mm3, mm7                ; mm0=(14 ** 15 **)
+    pfadd       mm4, mm7                ; mm4=(16 ** 17 **)
+
+    movq        mm5, mm6
+    punpcklwd   mm6, mm1                ; mm6=(10 12 ** **)
+    punpckhwd   mm5, mm1                ; mm5=(11 13 ** **)
+    movq        mm1, mm3
+    punpcklwd   mm3, mm4                ; mm3=(14 16 ** **)
+    punpckhwd   mm1, mm4                ; mm1=(15 17 ** **)
+
+    punpcklwd   mm6, mm5                ; mm6=(10 11 12 13)
+    punpcklwd   mm3, mm1                ; mm3=(14 15 16 17)
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         near .quantloop
+
+    femms                               ; empty MMX/3DNow! state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquant-mmx.asm b/media/libjpeg/simd/i386/jquant-mmx.asm
new file mode 100644
index 0000000000..61305c625d
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-mmx.asm
@@ -0,0 +1,276 @@
+;
+; jquant.asm - sample data conversion and quantization (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                    DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_mmx)
+
+EXTN(jsimd_convsamp_mmx):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        mm6, mm6                ; mm6=(all 0's)
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm0=(01234567)
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm2=(GHIJKLMN)
+    movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm3=(OPQRSTUV)
+
+    movq        mm4, mm0
+    punpcklbw   mm0, mm6                ; mm0=(0123)
+    punpckhbw   mm4, mm6                ; mm4=(4567)
+    movq        mm5, mm1
+    punpcklbw   mm1, mm6                ; mm1=(89AB)
+    punpckhbw   mm5, mm6                ; mm5=(CDEF)
+
+    paddw       mm0, mm7
+    paddw       mm4, mm7
+    paddw       mm1, mm7
+    paddw       mm5, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
+
+    movq        mm0, mm2
+    punpcklbw   mm2, mm6                ; mm2=(GHIJ)
+    punpckhbw   mm0, mm6                ; mm0=(KLMN)
+    movq        mm4, mm3
+    punpcklbw   mm3, mm6                ; mm3=(OPQR)
+    punpckhbw   mm4, mm6                ; mm4=(STUV)
+
+    paddw       mm2, mm7
+    paddw       mm0, mm7
+    paddw       mm3, mm7
+    paddw       mm4, mm7
+
+    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
+    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
+    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
+
+    add         esi, byte 4*SIZEOF_JSAMPROW
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         short .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
+;                    DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SHIFT(m, n, b) \
+  MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_mmx)
+
+EXTN(jsimd_quantize_mmx):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         ah, 2
+    alignx      16, 7
+.quantloop1:
+    mov         al, DCTSIZE2/8/2
+    alignx      16, 7
+.quantloop2:
+    movq        mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movq        mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
+
+    movq        mm0, mm2
+    movq        mm1, mm3
+
+    psraw       mm2, (WORD_BIT-1)       ; -1 if value < 0, 0 otherwise
+    psraw       mm3, (WORD_BIT-1)
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2
+    psubw       mm1, mm3
+
+    ;
+    ; MMX is an annoyingly crappy instruction set. It has two
+    ; misfeatures that are causing problems here:
+    ;
+    ; - All multiplications are signed.
+    ;
+    ; - The second operand for the shifts is not treated as packed.
+    ;
+    ;
+    ; We work around the first problem by implementing this algorithm:
+    ;
+    ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
+    ; {
+    ;   enum { SHORT_BIT = 16 };
+    ;   signed short sx = (signed short)x;
+    ;   signed short sy = (signed short)y;
+    ;   signed long sz;
+    ;
+    ;   sz = (long)sx * (long)sy;    /* signed multiply */
+    ;
+    ;   if (sx < 0) sz += (long)sy << SHORT_BIT;
+    ;   if (sy < 0) sz += (long)sx << SHORT_BIT;
+    ;
+    ;   return (unsigned long)sz;
+    ; }
+    ;
+    ; (note that a negative sx adds _sy_ and vice versa)
+    ;
+    ; For the second problem, we replace the shift by a multiplication.
+    ; Unfortunately that means we have to deal with the signed issue again.
+    ;
+
+    paddw       mm0, MMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       mm1, MMWORD [CORRECTION(0,1,edx)]
+
+    movq        mm4, mm0                ; store current value for later
+    movq        mm5, mm1
+    pmulhw      mm0, MMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhw      mm1, MMWORD [RECIPROCAL(0,1,edx)]
+    paddw       mm0, mm4  ; reciprocal is always negative (MSB=1),
+    paddw       mm1, mm5  ; so we always need to add the initial value
+                          ; (input value is never negative as we
+                          ; inverted it at the start of this routine)
+
+    ; here it gets a bit tricky as both scale
+    ; and mm0/mm1 can be negative
+    movq        mm6, MMWORD [SCALE(0,0,edx)]  ; scale
+    movq        mm7, MMWORD [SCALE(0,1,edx)]
+    movq        mm4, mm0
+    movq        mm5, mm1
+    pmulhw      mm0, mm6
+    pmulhw      mm1, mm7
+
+    psraw       mm6, (WORD_BIT-1)       ; determine if scale is negative
+    psraw       mm7, (WORD_BIT-1)
+
+    pand        mm6, mm4                ; and add input if it is
+    pand        mm7, mm5
+    paddw       mm0, mm6
+    paddw       mm1, mm7
+
+    psraw       mm4, (WORD_BIT-1)       ; then check if negative input
+    psraw       mm5, (WORD_BIT-1)
+
+    pand        mm4, MMWORD [SCALE(0,0,edx)]  ; and add scale if it is
+    pand        mm5, MMWORD [SCALE(0,1,edx)]
+    paddw       mm0, mm4
+    paddw       mm1, mm5
+
+    pxor        mm0, mm2                ; val = -val
+    pxor        mm1, mm3
+    psubw       mm0, mm2
+    psubw       mm1, mm3
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
+
+    add         esi, byte 8*SIZEOF_DCTELEM
+    add         edx, byte 8*SIZEOF_DCTELEM
+    add         edi, byte 8*SIZEOF_JCOEF
+    dec         al
+    jnz         near .quantloop2
+    dec         ah
+    jnz         near .quantloop1        ; to avoid branch misprediction
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquant-sse.asm b/media/libjpeg/simd/i386/jquant-sse.asm
new file mode 100644
index 0000000000..218adc976f
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquant-sse.asm
@@ -0,0 +1,208 @@
+;
+; jquant.asm - sample data conversion and quantization (SSE & MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                          FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
+
+EXTN(jsimd_convsamp_float_sse):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     mm7, mm7
+    psllw       mm7, 7
+    packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       mm0, mm7                ; mm0=(01234567)
+    psubb       mm1, mm7                ; mm1=(89ABCDEF)
+
+    punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
+    punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
+    punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
+    punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
+
+    punpcklwd   mm4, mm2                ; mm4=(***0***1)
+    punpckhwd   mm2, mm2                ; mm2=(***2***3)
+    punpcklwd   mm5, mm0                ; mm5=(***4***5)
+    punpckhwd   mm0, mm0                ; mm0=(***6***7)
+
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
+    psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
+    cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
+    cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
+    psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
+    psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
+    cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
+    cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)
+
+    punpcklwd   mm6, mm3                ; mm6=(***8***9)
+    punpckhwd   mm3, mm3                ; mm3=(***A***B)
+    punpcklwd   mm4, mm1                ; mm4=(***C***D)
+    punpckhwd   mm1, mm1                ; mm1=(***E***F)
+
+    psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
+    psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
+    cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
+    cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
+    psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
+    psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
+    cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
+    cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)
+
+    movlhps     xmm0, xmm1              ; xmm0=(0123)
+    movlhps     xmm2, xmm3              ; xmm2=(4567)
+    movlhps     xmm4, xmm5              ; xmm4=(89AB)
+    movlhps     xmm6, xmm7              ; xmm6=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         near .convloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                          FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse)
+
+EXTN(jsimd_quantize_float_sse):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    movhlps     xmm4, xmm0
+    movhlps     xmm5, xmm1
+
+    cvtps2pi    mm0, xmm0
+    cvtps2pi    mm1, xmm1
+    cvtps2pi    mm4, xmm4
+    cvtps2pi    mm5, xmm5
+
+    movhlps     xmm6, xmm2
+    movhlps     xmm7, xmm3
+
+    cvtps2pi    mm2, xmm2
+    cvtps2pi    mm3, xmm3
+    cvtps2pi    mm6, xmm6
+    cvtps2pi    mm7, xmm7
+
+    packssdw    mm0, mm4
+    packssdw    mm1, mm5
+    packssdw    mm2, mm6
+    packssdw    mm3, mm7
+
+    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         short .quantloop
+
+    emms                                ; empty MMX state
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquantf-sse2.asm b/media/libjpeg/simd/i386/jquantf-sse2.asm
new file mode 100644
index 0000000000..a881ab50f9
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquantf-sse2.asm
@@ -0,0 +1,168 @@
+;
+; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/2
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567)
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         esi, byte 2*SIZEOF_JSAMPROW
+    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+;
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; FAST_FLOAT *divisors
+%define workspace   ebp + 16            ; FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/16
+    alignx      16, 7
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1
+    packssdw    xmm2, xmm3
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
+
+    add         esi, byte 16*SIZEOF_FAST_FLOAT
+    add         edx, byte 16*SIZEOF_FAST_FLOAT
+    add         edi, byte 16*SIZEOF_JCOEF
+    dec         eax
+    jnz         short .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquanti-avx2.asm b/media/libjpeg/simd/i386/jquanti-avx2.asm
new file mode 100644
index 0000000000..5ed6bec246
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-avx2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
+
+    vinserti128 ymm0, ymm0, xmm1, 1
+    vinserti128 ymm2, ymm2, xmm3, 1
+    vinserti128 ymm4, ymm4, xmm5, 1
+    vinserti128 ymm6, ymm6, xmm7, 1
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpunpcklbw  ymm0, ymm0, ymm1
+    vpunpcklbw  ymm2, ymm2, ymm1
+    vpunpcklbw  ymm4, ymm4, ymm1
+    vpunpcklbw  ymm6, ymm6, ymm1
+
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm4, ymm4, ymm7
+    vpaddw      ymm6, ymm6, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
+
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]
+
+    vpsignw     ymm0, ymm0, ymm4
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jquanti-sse2.asm b/media/libjpeg/simd/i386/jquanti-sse2.asm
new file mode 100644
index 0000000000..0a509408aa
--- /dev/null
+++ b/media/libjpeg/simd/i386/jquanti-sse2.asm
@@ -0,0 +1,201 @@
+;
+; jquanti.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
+%define start_col    ebp + 12           ; JDIMENSION start_col
+%define workspace    ebp + 16           ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        ebp
+    mov         ebp, esp
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
+    mov         eax, JDIMENSION [start_col]
+    mov         edi, POINTER [workspace]       ; (DCTELEM *)
+    mov         ecx, DCTSIZE/4
+    alignx      16, 7
+.convloop:
+    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 4*SIZEOF_JSAMPROW
+    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         ecx
+    jnz         short .convloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    pop         ebp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+%define coef_block  ebp + 8             ; JCOEFPTR coef_block
+%define divisors    ebp + 12            ; DCTELEM *divisors
+%define workspace   ebp + 16            ; DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        ebp
+    mov         ebp, esp
+;   push        ebx                     ; unused
+;   push        ecx                     ; unused
+;   push        edx                     ; need not be preserved
+    push        esi
+    push        edi
+
+    mov         esi, POINTER [workspace]
+    mov         edx, POINTER [divisors]
+    mov         edi, JCOEFPTR [coef_block]
+    mov         eax, DCTSIZE2/32
+    alignx      16, 7
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]
+
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+    add         esi, byte 32*SIZEOF_DCTELEM
+    add         edx, byte 32*SIZEOF_DCTELEM
+    add         edi, byte 32*SIZEOF_JCOEF
+    dec         eax
+    jnz         near .quantloop
+
+    pop         edi
+    pop         esi
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; unused
+;   pop         ebx                     ; unused
+    pop         ebp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/i386/jsimd.c b/media/libjpeg/simd/i386/jsimd.c
new file mode 100644
index 0000000000..80bc821ff4
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimd.c
@@ -0,0 +1,1246 @@
+/*
+ * jsimd_i386.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 32-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order)  (((unsigned)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCEMMX") && !strcmp(env, "1"))
+    simd_support &= JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCE3DNOW") && !strcmp(env, "1"))
+    simd_support &= JSIMD_3DNOW | JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE | JSIMD_MMX;
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_AVX2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extrgb_ycc_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    mmxfct = jsimd_extrgbx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    mmxfct = jsimd_extbgrx_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    mmxfct = jsimd_extxbgr_ycc_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    mmxfct = jsimd_extxrgb_ycc_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    mmxfct = jsimd_rgb_ycc_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*mmxfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    mmxfct = jsimd_extrgb_gray_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    mmxfct = jsimd_extrgbx_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    mmxfct = jsimd_extbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    mmxfct = jsimd_extbgrx_gray_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    mmxfct = jsimd_extxbgr_gray_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    mmxfct = jsimd_extxrgb_gray_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    mmxfct = jsimd_rgb_gray_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extrgb_convert_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    mmxfct = jsimd_ycc_extrgbx_convert_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extbgr_convert_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    mmxfct = jsimd_ycc_extbgrx_convert_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    mmxfct = jsimd_ycc_extxbgr_convert_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    mmxfct = jsimd_ycc_extxrgb_convert_mmx;
+    break;
+  default:
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    mmxfct = jsimd_ycc_rgb_convert_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    mmxfct = jsimd_h2v2_merged_upsample_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgb_merged_upsample_mmx;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extrgbx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extbgrx_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxbgr_merged_upsample_mmx;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_extxrgb_merged_upsample_mmx;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    mmxfct = jsimd_h2v1_merged_upsample_mmx;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else if (simd_support & JSIMD_SSE2)
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_mmx(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_fdct_islow_avx2(data);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_fdct_islow_sse2(data);
+  else
+    jsimd_fdct_islow_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    jsimd_fdct_ifast_sse2(data);
+  else
+    jsimd_fdct_ifast_mmx(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    jsimd_fdct_float_sse(data);
+  else if (simd_support & JSIMD_3DNOW)
+    jsimd_fdct_float_3dnow(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+  if (simd_support & JSIMD_SSE)
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_mmx(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  if (simd_support & JSIMD_SSE2)
+    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+  else if (simd_support & JSIMD_SSE)
+    jsimd_quantize_float_sse(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+  else
+    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+  else
+    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+  if (simd_support & JSIMD_MMX)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    return 1;
+  if (simd_support & JSIMD_3DNOW)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if (simd_support & JSIMD_SSE2)
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
+    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
+                         output_col);
+  else
+    jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 4)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 4)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/i386/jsimdcpu.asm b/media/libjpeg/simd/i386/jsimdcpu.asm
new file mode 100644
index 0000000000..ddcafa9e21
--- /dev/null
+++ b/media/libjpeg/simd/i386/jsimdcpu.asm
@@ -0,0 +1,135 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        32
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        ebx
+;   push        ecx                     ; need not be preserved
+;   push        edx                     ; need not be preserved
+;   push        esi                     ; unused
+    push        edi
+
+    xor         edi, edi                ; simd support flag
+
+    pushfd
+    pop         eax
+    mov         edx, eax
+    xor         eax, 1<<21              ; flip ID bit in EFLAGS
+    push        eax
+    popfd
+    pushfd
+    pop         eax
+    xor         eax, edx
+    jz          near .return            ; CPUID is not supported
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    xor         eax, eax
+    cpuid
+    test        eax, eax
+    jz          near .return
+    cmp         eax, 7
+    jl          short .no_avx2          ; Maximum leaf < 07H
+
+    ; Check for AVX2 instruction support
+    mov         eax, 7
+    xor         ecx, ecx
+    cpuid
+    mov         eax, ebx
+    test        eax, 1<<5               ; bit5:AVX2
+    jz          short .no_avx2
+
+    ; Check for AVX2 O/S support
+    mov         eax, 1
+    xor         ecx, ecx
+    cpuid
+    test        ecx, 1<<27
+    jz          short .no_avx2          ; O/S does not support XSAVE
+    test        ecx, 1<<28
+    jz          short .no_avx2          ; CPU does not support AVX2
+
+    xor         ecx, ecx
+    xgetbv
+    and         eax, 6
+    cmp         eax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jnz         short .no_avx2
+
+    or          edi, JSIMD_AVX2
+.no_avx2:
+
+    ; Check CPUID leaf 01H for MMX, SSE, and SSE2 support
+    xor         eax, eax
+    inc         eax
+    cpuid
+    mov         eax, edx                ; eax = Standard feature flags
+
+    ; Check for MMX instruction support
+    test        eax, 1<<23              ; bit23:MMX
+    jz          short .no_mmx
+    or          edi, byte JSIMD_MMX
+.no_mmx:
+    test        eax, 1<<25              ; bit25:SSE
+    jz          short .no_sse
+    or          edi, byte JSIMD_SSE
+.no_sse:
+    test        eax, 1<<26              ; bit26:SSE2
+    jz          short .no_sse2
+    or          edi, byte JSIMD_SSE2
+.no_sse2:
+
+    ; Check for 3DNow! instruction support
+    mov         eax, 0x80000000
+    cpuid
+    cmp         eax, 0x80000000
+    jbe         short .return
+
+    mov         eax, 0x80000001
+    cpuid
+    mov         eax, edx                ; eax = Extended feature flags
+
+    test        eax, 1<<31              ; bit31:3DNow!(vendor independent)
+    jz          short .no_3dnow
+    or          edi, byte JSIMD_3DNOW
+.no_3dnow:
+
+.return:
+    mov         eax, edi
+
+    pop         edi
+;   pop         esi                     ; unused
+;   pop         edx                     ; need not be preserved
+;   pop         ecx                     ; need not be preserved
+    pop         ebx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/jccolext-mmx.asm b/media/libjpeg/simd/jccolext-mmx.asm
deleted file mode 100644
index 96a0372b1b..0000000000
--- a/media/libjpeg/simd/jccolext-mmx.asm
+++ /dev/null
@@ -1,476 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
-;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                           JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          8
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_rgb_ycc_convert_mmx)
-
-EXTN(jsimd_rgb_ycc_convert_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edx
-        push    ebx
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        mov     ebx, JSAMPROW [ebx]     ; outptr1
-        mov     edx, JSAMPROW [edx]     ; outptr2
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        xor     eax,eax
-        mov     al, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        xor     edx,edx
-        mov     dx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    mmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    mmG, DWORD [esi+ecx]
-        psllq   mmA, DWORD_BIT
-        por     mmA,mmG
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        movq    mmG,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        mov     ecx, SIZEOF_MMWORD
-        jmp     short .rgb_ycc_cnv
-.column_ld16:
-        test    cl, 2*SIZEOF_MMWORD
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_ycc_cnv
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-        ; mmA=(00 10 20 01 11 21 02 12)
-        ; mmG=(22 03 13 23 04 14 24 05)
-        ; mmF=(15 25 06 16 26 07 17 27)
-
-        movq      mmD,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
-        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
-
-        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
-        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
-
-        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
-        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
-
-        movq      mmE,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
-        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
-
-        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
-
-        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
-        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
-
-        pxor      mmH,mmH
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
-
-        movq      mmB,mmE
-        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
-        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
-
-        movq      mmF,mmD
-        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
-        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_MMWORD/8
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_MMWORD/8
-        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_MMWORD/4
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_MMWORD/4
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-        test    cl, SIZEOF_MMWORD/2
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_ycc_cnv
-        movq    mmD,mmA
-        movq    mmC,mmF
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_ycc_cnv:
-        ; mmA=(00 10 20 30 01 11 21 31)
-        ; mmF=(02 12 22 32 03 13 23 33)
-        ; mmD=(04 14 24 34 05 15 25 35)
-        ; mmC=(06 16 26 36 07 17 27 37)
-
-        movq      mmB,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
-        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
-
-        movq      mmG,mmD
-        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
-        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
-
-        movq      mmE,mmA
-        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
-
-        movq      mmH,mmB
-        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
-        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
-
-        pxor      mmF,mmF
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
-
-        movq      mmD,mmB
-        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
-        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
-
-        movq      mmG,mmE
-        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
-        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
-
-        punpcklbw mmF,mmH
-        punpckhbw mmH,mmH
-        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
-        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-        movq      MMWORD [wk(0)], mm0   ; wk(0)=RE
-        movq      MMWORD [wk(1)], mm1   ; wk(1)=RO
-        movq      MMWORD [wk(2)], mm4   ; wk(2)=BE
-        movq      MMWORD [wk(3)], mm5   ; wk(3)=BO
-
-        movq      mm6,mm1
-        punpcklwd mm1,mm3
-        punpckhwd mm6,mm3
-        movq      mm7,mm1
-        movq      mm4,mm6
-        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-        pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-        movq      MMWORD [wk(4)], mm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-        movq      MMWORD [wk(5)], mm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        pxor      mm1,mm1
-        pxor      mm6,mm6
-        punpcklwd mm1,mm5               ; mm1=BOL
-        punpckhwd mm6,mm5               ; mm6=BOH
-        psrld     mm1,1                 ; mm1=BOL*FIX(0.500)
-        psrld     mm6,1                 ; mm6=BOH*FIX(0.500)
-
-        movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
-
-        paddd     mm7,mm1
-        paddd     mm4,mm6
-        paddd     mm7,mm5
-        paddd     mm4,mm5
-        psrld     mm7,SCALEBITS         ; mm7=CbOL
-        psrld     mm4,SCALEBITS         ; mm4=CbOH
-        packssdw  mm7,mm4               ; mm7=CbO
-
-        movq      mm1, MMWORD [wk(2)]   ; mm1=BE
-
-        movq      mm6,mm0
-        punpcklwd mm0,mm2
-        punpckhwd mm6,mm2
-        movq      mm5,mm0
-        movq      mm4,mm6
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-        movq      MMWORD [wk(6)], mm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movq      MMWORD [wk(7)], mm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        pxor      mm0,mm0
-        pxor      mm6,mm6
-        punpcklwd mm0,mm1               ; mm0=BEL
-        punpckhwd mm6,mm1               ; mm6=BEH
-        psrld     mm0,1                 ; mm0=BEL*FIX(0.500)
-        psrld     mm6,1                 ; mm6=BEH*FIX(0.500)
-
-        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-        paddd     mm5,mm0
-        paddd     mm4,mm6
-        paddd     mm5,mm1
-        paddd     mm4,mm1
-        psrld     mm5,SCALEBITS         ; mm5=CbEL
-        psrld     mm4,SCALEBITS         ; mm4=CbEH
-        packssdw  mm5,mm4               ; mm5=CbE
-
-        psllw     mm7,BYTE_BIT
-        por       mm5,mm7               ; mm5=Cb
-        movq      MMWORD [ebx], mm5     ; Save Cb
-
-        movq      mm0, MMWORD [wk(3)]   ; mm0=BO
-        movq      mm6, MMWORD [wk(2)]   ; mm6=BE
-        movq      mm1, MMWORD [wk(1)]   ; mm1=RO
-
-        movq      mm4,mm0
-        punpcklwd mm0,mm3
-        punpckhwd mm4,mm3
-        movq      mm7,mm0
-        movq      mm5,mm4
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-        pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
-
-        paddd     mm0, MMWORD [wk(4)]
-        paddd     mm4, MMWORD [wk(5)]
-        paddd     mm0,mm3
-        paddd     mm4,mm3
-        psrld     mm0,SCALEBITS         ; mm0=YOL
-        psrld     mm4,SCALEBITS         ; mm4=YOH
-        packssdw  mm0,mm4               ; mm0=YO
-
-        pxor      mm3,mm3
-        pxor      mm4,mm4
-        punpcklwd mm3,mm1               ; mm3=ROL
-        punpckhwd mm4,mm1               ; mm4=ROH
-        psrld     mm3,1                 ; mm3=ROL*FIX(0.500)
-        psrld     mm4,1                 ; mm4=ROH*FIX(0.500)
-
-        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
-
-        paddd     mm7,mm3
-        paddd     mm5,mm4
-        paddd     mm7,mm1
-        paddd     mm5,mm1
-        psrld     mm7,SCALEBITS         ; mm7=CrOL
-        psrld     mm5,SCALEBITS         ; mm5=CrOH
-        packssdw  mm7,mm5               ; mm7=CrO
-
-        movq      mm3, MMWORD [wk(0)]   ; mm3=RE
-
-        movq      mm4,mm6
-        punpcklwd mm6,mm2
-        punpckhwd mm4,mm2
-        movq      mm1,mm6
-        movq      mm5,mm4
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-        pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
-
-        paddd     mm6, MMWORD [wk(6)]
-        paddd     mm4, MMWORD [wk(7)]
-        paddd     mm6,mm2
-        paddd     mm4,mm2
-        psrld     mm6,SCALEBITS         ; mm6=YEL
-        psrld     mm4,SCALEBITS         ; mm4=YEH
-        packssdw  mm6,mm4               ; mm6=YE
-
-        psllw     mm0,BYTE_BIT
-        por       mm6,mm0               ; mm6=Y
-        movq      MMWORD [edi], mm6     ; Save Y
-
-        pxor      mm2,mm2
-        pxor      mm4,mm4
-        punpcklwd mm2,mm3               ; mm2=REL
-        punpckhwd mm4,mm3               ; mm4=REH
-        psrld     mm2,1                 ; mm2=REL*FIX(0.500)
-        psrld     mm4,1                 ; mm4=REH*FIX(0.500)
-
-        movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
-
-        paddd     mm1,mm2
-        paddd     mm5,mm4
-        paddd     mm1,mm0
-        paddd     mm5,mm0
-        psrld     mm1,SCALEBITS         ; mm1=CrEL
-        psrld     mm5,SCALEBITS         ; mm5=CrEH
-        packssdw  mm1,mm5               ; mm1=CrE
-
-        psllw     mm7,BYTE_BIT
-        por       mm1,mm7               ; mm1=Cr
-        movq      MMWORD [edx], mm1     ; Save Cr
-
-        sub     ecx, byte SIZEOF_MMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
-        add     edi, byte SIZEOF_MMWORD                 ; outptr0
-        add     ebx, byte SIZEOF_MMWORD                 ; outptr1
-        add     edx, byte SIZEOF_MMWORD                 ; outptr2
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        pop     ebx
-        pop     edx
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jccolext-sse2-64.asm b/media/libjpeg/simd/jccolext-sse2-64.asm
deleted file mode 100644
index 8e4642d3bc..0000000000
--- a/media/libjpeg/simd/jccolext-sse2-64.asm
+++ /dev/null
@@ -1,486 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          8
-
-        align   16
-
-        global  EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov rsi, r12
-        mov ecx, r13d
-        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
-        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
-        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
-        pop     rcx
-
-        mov rsi, r11
-        mov     eax, r14d
-        test    rax,rax
-        jle     near .return
-.rowloop:
-        push    rdx
-        push    rbx
-        push    rdi
-        push    rsi
-        push    rcx                     ; col
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr
-        mov     rdi, JSAMPROW [rdi]     ; outptr0
-        mov     rbx, JSAMPROW [rbx]     ; outptr1
-        mov     rdx, JSAMPROW [rdx]     ; outptr2
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    rax
-        push    rdx
-        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_BYTE
-        movzx   rax, BYTE [rsi+rcx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_WORD
-        movzx   rdx, WORD [rsi+rcx]
-        shl     rax, WORD_BIT
-        or      rax,rdx
-.column_ld4:
-        movd    xmmA,eax
-        pop     rdx
-        pop     rax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     rcx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .rgb_ycc_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
-        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
-        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
-        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        movdqa    xmm7,xmm1
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-        pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        pxor      xmm1,xmm1
-        pxor      xmm6,xmm6
-        punpcklwd xmm1,xmm5             ; xmm1=BOL
-        punpckhwd xmm6,xmm5             ; xmm6=BOH
-        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
-
-        movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm1
-        paddd     xmm4,xmm6
-        paddd     xmm7,xmm5
-        paddd     xmm4,xmm5
-        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
-        packssdw  xmm7,xmm4             ; xmm7=CbO
-
-        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-        pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        pxor      xmm0,xmm0
-        pxor      xmm6,xmm6
-        punpcklwd xmm0,xmm1             ; xmm0=BEL
-        punpckhwd xmm6,xmm1             ; xmm6=BEH
-        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
-
-        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm5,xmm0
-        paddd     xmm4,xmm6
-        paddd     xmm5,xmm1
-        paddd     xmm4,xmm1
-        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
-        packssdw  xmm5,xmm4             ; xmm5=CbE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm5,xmm7             ; xmm5=Cb
-        movdqa    XMMWORD [rbx], xmm5   ; Save Cb
-
-        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
-        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        movdqa    xmm7,xmm0
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-        pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, XMMWORD [wk(4)]
-        paddd     xmm4, XMMWORD [wk(5)]
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        pxor      xmm3,xmm3
-        pxor      xmm4,xmm4
-        punpcklwd xmm3,xmm1             ; xmm3=ROL
-        punpckhwd xmm4,xmm1             ; xmm4=ROH
-        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
-
-        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm3
-        paddd     xmm5,xmm4
-        paddd     xmm7,xmm1
-        paddd     xmm5,xmm1
-        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
-        packssdw  xmm7,xmm5             ; xmm7=CrO
-
-        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-        pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(6)]
-        paddd     xmm4, XMMWORD [wk(7)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [rdi], xmm6   ; Save Y
-
-        pxor      xmm2,xmm2
-        pxor      xmm4,xmm4
-        punpcklwd xmm2,xmm3             ; xmm2=REL
-        punpckhwd xmm4,xmm3             ; xmm4=REH
-        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
-
-        movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm1,xmm2
-        paddd     xmm5,xmm4
-        paddd     xmm1,xmm0
-        paddd     xmm5,xmm0
-        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
-        packssdw  xmm1,xmm5             ; xmm1=CrE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm1,xmm7             ; xmm1=Cr
-        movdqa    XMMWORD [rdx], xmm1   ; Save Cr
-
-        sub     rcx, byte SIZEOF_XMMWORD
-        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
-        add     rbx, byte SIZEOF_XMMWORD                ; outptr1
-        add     rdx, byte SIZEOF_XMMWORD                ; outptr2
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    rcx,rcx
-        jnz     near .column_ld1
-
-        pop     rcx                     ; col
-        pop     rsi
-        pop     rdi
-        pop     rbx
-        pop     rdx
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     rdi, byte SIZEOF_JSAMPROW
-        add     rbx, byte SIZEOF_JSAMPROW
-        add     rdx, byte SIZEOF_JSAMPROW
-        dec     rax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jccolext-sse2.asm b/media/libjpeg/simd/jccolext-sse2.asm
deleted file mode 100644
index cc38e98a18..0000000000
--- a/media/libjpeg/simd/jccolext-sse2.asm
+++ /dev/null
@@ -1,503 +0,0 @@
-;
-; jccolext.asm - colorspace conversion (SSE2)
-;
-; x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          8
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-
-        global  EXTN(jsimd_rgb_ycc_convert_sse2)
-
-EXTN(jsimd_rgb_ycc_convert_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edx
-        push    ebx
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        mov     ebx, JSAMPROW [ebx]     ; outptr1
-        mov     edx, JSAMPROW [edx]     ; outptr2
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        movzx   eax, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        movzx   edx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    xmmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     ecx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .rgb_ycc_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_ycc_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_ycc_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_ycc_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
-        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
-        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
-        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
-        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        movdqa    xmm7,xmm1
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-        pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
-        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        pxor      xmm1,xmm1
-        pxor      xmm6,xmm6
-        punpcklwd xmm1,xmm5             ; xmm1=BOL
-        punpckhwd xmm6,xmm5             ; xmm6=BOH
-        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)
-
-        movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm1
-        paddd     xmm4,xmm6
-        paddd     xmm7,xmm5
-        paddd     xmm4,xmm5
-        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
-        packssdw  xmm7,xmm4             ; xmm7=CbO
-
-        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm6
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
-
-        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        pxor      xmm0,xmm0
-        pxor      xmm6,xmm6
-        punpcklwd xmm0,xmm1             ; xmm0=BEL
-        punpckhwd xmm6,xmm1             ; xmm6=BEH
-        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
-        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)
-
-        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm5,xmm0
-        paddd     xmm4,xmm6
-        paddd     xmm5,xmm1
-        paddd     xmm4,xmm1
-        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
-        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
-        packssdw  xmm5,xmm4             ; xmm5=CbE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm5,xmm7             ; xmm5=Cb
-        movdqa    XMMWORD [ebx], xmm5   ; Save Cb
-
-        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
-        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        movdqa    xmm7,xmm0
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-        pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
-
-        movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, XMMWORD [wk(4)]
-        paddd     xmm4, XMMWORD [wk(5)]
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        pxor      xmm3,xmm3
-        pxor      xmm4,xmm4
-        punpcklwd xmm3,xmm1             ; xmm3=ROL
-        punpckhwd xmm4,xmm1             ; xmm4=ROH
-        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)
-
-        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm7,xmm3
-        paddd     xmm5,xmm4
-        paddd     xmm7,xmm1
-        paddd     xmm5,xmm1
-        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
-        packssdw  xmm7,xmm5             ; xmm7=CrO
-
-        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm5,xmm4
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-        pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
-
-        movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(6)]
-        paddd     xmm4, XMMWORD [wk(7)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [edi], xmm6   ; Save Y
-
-        pxor      xmm2,xmm2
-        pxor      xmm4,xmm4
-        punpcklwd xmm2,xmm3             ; xmm2=REL
-        punpckhwd xmm4,xmm3             ; xmm4=REH
-        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
-        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)
-
-        movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]
-
-        paddd     xmm1,xmm2
-        paddd     xmm5,xmm4
-        paddd     xmm1,xmm0
-        paddd     xmm5,xmm0
-        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
-        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
-        packssdw  xmm1,xmm5             ; xmm1=CrE
-
-        psllw     xmm7,BYTE_BIT
-        por       xmm1,xmm7             ; xmm1=Cr
-        movdqa    XMMWORD [edx], xmm1   ; Save Cr
-
-        sub     ecx, byte SIZEOF_XMMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     edi, byte SIZEOF_XMMWORD                ; outptr0
-        add     ebx, byte SIZEOF_XMMWORD                ; outptr1
-        add     edx, byte SIZEOF_XMMWORD                ; outptr2
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        pop     ebx
-        pop     edx
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jccolor-altivec.c b/media/libjpeg/simd/jccolor-altivec.c
deleted file mode 100644
index ec473320e5..0000000000
--- a/media/libjpeg/simd/jccolor-altivec.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> YCC CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_081 5329                 /* FIX(0.08131) */
-#define F_0_114 7471                 /* FIX(0.11400) */
-#define F_0_168 11059                /* FIX(0.16874) */
-#define F_0_250 16384                /* FIX(0.25000) */
-#define F_0_299 19595                /* FIX(0.29900) */
-#define F_0_331 21709                /* FIX(0.33126) */
-#define F_0_418 27439                /* FIX(0.41869) */
-#define F_0_500 32768                /* FIX(0.50000) */
-#define F_0_587 38470                /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extrgbx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extbgrx_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxbgr_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
-#define jsimd_rgb_ycc_convert_altivec jsimd_extxrgb_ycc_convert_altivec
-#include "jccolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_ycc_convert_altivec
diff --git a/media/libjpeg/simd/jccolor-mmx.asm b/media/libjpeg/simd/jccolor-mmx.asm
deleted file mode 100644
index c4e6d88be3..0000000000
--- a/media/libjpeg/simd/jccolor-mmx.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_081 equ      5329                   ; FIX(0.08131)
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_168 equ     11059                   ; FIX(0.16874)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_331 equ     21709                   ; FIX(0.33126)
-F_0_418 equ     27439                   ; FIX(0.41869)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_ycc_convert_mmx)
-
-EXTN(jconst_rgb_ycc_convert_mmx):
-
-PW_F0299_F0337  times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 2 dw  F_0_114, F_0_250
-PW_MF016_MF033  times 2 dw -F_0_168,-F_0_331
-PW_MF008_MF041  times 2 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 2 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF      times 2 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgb_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extrgbx_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgr_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extbgrx_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxbgr_ycc_convert_mmx
-%include "jccolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_mmx jsimd_extxrgb_ycc_convert_mmx
-%include "jccolext-mmx.asm"
diff --git a/media/libjpeg/simd/jccolor-sse2-64.asm b/media/libjpeg/simd/jccolor-sse2-64.asm
deleted file mode 100644
index bd2188b4c0..0000000000
--- a/media/libjpeg/simd/jccolor-sse2-64.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_081 equ      5329                   ; FIX(0.08131)
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_168 equ     11059                   ; FIX(0.16874)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_331 equ     21709                   ; FIX(0.33126)
-F_0_418 equ     27439                   ; FIX(0.41869)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_ycc_convert_sse2)
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033  times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041  times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jccolext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jccolor-sse2.asm b/media/libjpeg/simd/jccolor-sse2.asm
deleted file mode 100644
index 13124d13d7..0000000000
--- a/media/libjpeg/simd/jccolor-sse2.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; jccolor.asm - colorspace conversion (SSE2)
-;
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_081 equ      5329                   ; FIX(0.08131)
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_168 equ     11059                   ; FIX(0.16874)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_331 equ     21709                   ; FIX(0.33126)
-F_0_418 equ     27439                   ; FIX(0.41869)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_ycc_convert_sse2)
-
-EXTN(jconst_rgb_ycc_convert_sse2):
-
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PW_MF016_MF033  times 4 dw -F_0_168,-F_0_331
-PW_MF008_MF041  times 4 dw -F_0_081,-F_0_418
-PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS)
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgb_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extrgbx_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgr_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extbgrx_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxbgr_ycc_convert_sse2
-%include "jccolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_ycc_convert_sse2 jsimd_extxrgb_ycc_convert_sse2
-%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/jcgray-altivec.c b/media/libjpeg/simd/jcgray-altivec.c
deleted file mode 100644
index 684df5ef1e..0000000000
--- a/media/libjpeg/simd/jcgray-altivec.c
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* RGB --> GRAYSCALE CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_114 7471                 /* FIX(0.11400) */
-#define F_0_250 16384                /* FIX(0.25000) */
-#define F_0_299 19595                /* FIX(0.29900) */
-#define F_0_587 38470                /* FIX(0.58700) */
-#define F_0_337 (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-
-#define RGBG_INDEX0 {0,1,3,4,6,7,9,10,2,1,5,4,8,7,11,10}
-#define RGBG_INDEX1 {12,13,15,16,18,19,21,22,14,13,17,16,20,19,23,22}
-#define RGBG_INDEX2 {8,9,11,12,14,15,17,18,10,9,13,12,16,15,19,18}
-#define RGBG_INDEX3 {4,5,7,8,10,11,13,14,6,5,9,8,12,11,15,14}
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGBG_INDEX {0,1,4,5,8,9,12,13,2,1,6,5,10,9,14,13}
-#define jsimd_rgb_gray_convert_altivec jsimd_extrgbx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGBG_INDEX0 {2,1,5,4,8,7,11,10,0,1,3,4,6,7,9,10}
-#define RGBG_INDEX1 {14,13,17,16,20,19,23,22,12,13,15,16,18,19,21,22}
-#define RGBG_INDEX2 {10,9,13,12,16,15,19,18,8,9,11,12,14,15,17,18}
-#define RGBG_INDEX3 {6,5,9,8,12,11,15,14,4,5,7,8,10,11,13,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX0
-#undef RGBG_INDEX1
-#undef RGBG_INDEX2
-#undef RGBG_INDEX3
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGBG_INDEX {2,1,6,5,10,9,14,13,0,1,4,5,8,9,12,13}
-#define jsimd_rgb_gray_convert_altivec jsimd_extbgrx_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGBG_INDEX {3,2,7,6,11,10,15,14,1,2,5,6,9,10,13,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extxbgr_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGBG_INDEX {1,2,5,6,9,10,13,14,3,2,7,6,11,10,15,14}
-#define jsimd_rgb_gray_convert_altivec jsimd_extxrgb_gray_convert_altivec
-#include "jcgryext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGBG_INDEX
-#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/jcgray-mmx.asm b/media/libjpeg/simd/jcgray-mmx.asm
deleted file mode 100644
index 0819b6ca01..0000000000
--- a/media/libjpeg/simd/jcgray-mmx.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_gray_convert_mmx)
-
-EXTN(jconst_rgb_gray_convert_mmx):
-
-PW_F0299_F0337  times 2 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 2 dw  F_0_114, F_0_250
-PD_ONEHALF      times 2 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgb_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extrgbx_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgr_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extbgrx_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxbgr_gray_convert_mmx
-%include "jcgryext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_mmx jsimd_extxrgb_gray_convert_mmx
-%include "jcgryext-mmx.asm"
diff --git a/media/libjpeg/simd/jcgray-sse2-64.asm b/media/libjpeg/simd/jcgray-sse2-64.asm
deleted file mode 100644
index bafd302aa5..0000000000
--- a/media/libjpeg/simd/jcgray-sse2-64.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_gray_convert_sse2)
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jcgray-sse2.asm b/media/libjpeg/simd/jcgray-sse2.asm
deleted file mode 100644
index 5b0b466953..0000000000
--- a/media/libjpeg/simd/jcgray-sse2.asm
+++ /dev/null
@@ -1,114 +0,0 @@
-;
-; jcgray.asm - grayscale colorspace conversion (SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_114 equ      7471                   ; FIX(0.11400)
-F_0_250 equ     16384                   ; FIX(0.25000)
-F_0_299 equ     19595                   ; FIX(0.29900)
-F_0_587 equ     38470                   ; FIX(0.58700)
-F_0_337 equ     (F_0_587 - F_0_250)     ; FIX(0.58700) - FIX(0.25000)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_rgb_gray_convert_sse2)
-
-EXTN(jconst_rgb_gray_convert_sse2):
-
-PW_F0299_F0337  times 4 dw  F_0_299, F_0_337
-PW_F0114_F0250  times 4 dw  F_0_114, F_0_250
-PD_ONEHALF      times 4 dd  (1 << (SCALEBITS-1))
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgb_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extrgbx_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgr_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extbgrx_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxbgr_gray_convert_sse2
-%include "jcgryext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_rgb_gray_convert_sse2 jsimd_extxrgb_gray_convert_sse2
-%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/jcgryext-mmx.asm b/media/libjpeg/simd/jcgryext-mmx.asm
deleted file mode 100644
index 1c1b8d8bc4..0000000000
--- a/media/libjpeg/simd/jcgryext-mmx.asm
+++ /dev/null
@@ -1,356 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
-;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                             JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_rgb_gray_convert_mmx)
-
-EXTN(jsimd_rgb_gray_convert_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        xor     eax,eax
-        mov     al, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        xor     edx,edx
-        mov     dx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    mmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    mmG, DWORD [esi+ecx]
-        psllq   mmA, DWORD_BIT
-        por     mmA,mmG
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        movq    mmG,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        mov     ecx, SIZEOF_MMWORD
-        jmp     short .rgb_gray_cnv
-.column_ld16:
-        test    cl, 2*SIZEOF_MMWORD
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_gray_cnv
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-        ; mmA=(00 10 20 01 11 21 02 12)
-        ; mmG=(22 03 13 23 04 14 24 05)
-        ; mmF=(15 25 06 16 26 07 17 27)
-
-        movq      mmD,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
-        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
-
-        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
-        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
-
-        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
-        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
-
-        movq      mmE,mmA
-        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
-        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
-
-        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
-
-        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
-        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
-
-        pxor      mmH,mmH
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
-
-        movq      mmB,mmE
-        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
-        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
-
-        movq      mmF,mmD
-        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
-        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_MMWORD/8
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_MMWORD/8
-        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_MMWORD/4
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_MMWORD/4
-        movq    mmF,mmA
-        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld4:
-        test    cl, SIZEOF_MMWORD/2
-        mov     ecx, SIZEOF_MMWORD
-        jz      short .rgb_gray_cnv
-        movq    mmD,mmA
-        movq    mmC,mmF
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
-        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
-
-.rgb_gray_cnv:
-        ; mmA=(00 10 20 30 01 11 21 31)
-        ; mmF=(02 12 22 32 03 13 23 33)
-        ; mmD=(04 14 24 34 05 15 25 35)
-        ; mmC=(06 16 26 36 07 17 27 37)
-
-        movq      mmB,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
-        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
-
-        movq      mmG,mmD
-        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
-        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
-
-        movq      mmE,mmA
-        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
-        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
-
-        movq      mmH,mmB
-        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
-        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
-
-        pxor      mmF,mmF
-
-        movq      mmC,mmA
-        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
-        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
-
-        movq      mmD,mmB
-        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
-        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
-
-        movq      mmG,mmE
-        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
-        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
-
-        punpcklbw mmF,mmH
-        punpckhbw mmH,mmH
-        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
-        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
-        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-        movq      mm6,mm1
-        punpcklwd mm1,mm3
-        punpckhwd mm6,mm3
-        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movq      mm7, mm6      ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movq      mm6,mm0
-        punpcklwd mm0,mm2
-        punpckhwd mm6,mm2
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movq      MMWORD [wk(0)], mm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movq      MMWORD [wk(1)], mm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movq      mm0, mm5      ; mm0=BO
-        movq      mm6, mm4      ; mm6=BE
-
-        movq      mm4,mm0
-        punpcklwd mm0,mm3
-        punpckhwd mm4,mm3
-        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
-
-        paddd     mm0, mm1
-        paddd     mm4, mm7
-        paddd     mm0,mm3
-        paddd     mm4,mm3
-        psrld     mm0,SCALEBITS         ; mm0=YOL
-        psrld     mm4,SCALEBITS         ; mm4=YOH
-        packssdw  mm0,mm4               ; mm0=YO
-
-        movq      mm4,mm6
-        punpcklwd mm6,mm2
-        punpckhwd mm4,mm2
-        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
-
-        paddd     mm6, MMWORD [wk(0)]
-        paddd     mm4, MMWORD [wk(1)]
-        paddd     mm6,mm2
-        paddd     mm4,mm2
-        psrld     mm6,SCALEBITS         ; mm6=YEL
-        psrld     mm4,SCALEBITS         ; mm4=YEH
-        packssdw  mm6,mm4               ; mm6=YE
-
-        psllw     mm0,BYTE_BIT
-        por       mm6,mm0               ; mm6=Y
-        movq      MMWORD [edi], mm6     ; Save Y
-
-        sub     ecx, byte SIZEOF_MMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
-        add     edi, byte SIZEOF_MMWORD                 ; outptr0
-        cmp     ecx, byte SIZEOF_MMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jcgryext-sse2-64.asm b/media/libjpeg/simd/jcgryext-sse2-64.asm
deleted file mode 100644
index 541355af86..0000000000
--- a/media/libjpeg/simd/jcgryext-sse2-64.asm
+++ /dev/null
@@ -1,365 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-; r10 = JDIMENSION img_width
-; r11 = JSAMPARRAY input_buf
-; r12 = JSAMPIMAGE output_buf
-; r13 = JDIMENSION output_row
-; r14 = int num_rows
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-
-        global  EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov rsi, r12
-        mov ecx, r13d
-        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]
-
-        pop     rcx
-
-        mov rsi, r11
-        mov     eax, r14d
-        test    rax,rax
-        jle     near .return
-.rowloop:
-        push    rdi
-        push    rsi
-        push    rcx                     ; col
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr
-        mov     rdi, JSAMPROW [rdi]     ; outptr0
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    rax
-        push    rdx
-        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_BYTE
-        movzx   rax, BYTE [rsi+rcx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_WORD
-        movzx   rdx, WORD [rsi+rcx]
-        shl     rax, WORD_BIT
-        or      rax,rdx
-.column_ld4:
-        movd    xmmA,eax
-        pop     rdx
-        pop     rax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     rcx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [rsi+rcx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .rgb_gray_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     rcx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     rcx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     rcx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    xmm0, xmm5    ; xmm0=BO
-        movdqa    xmm6, xmm4    ; xmm6=BE
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, xmm1
-        paddd     xmm4, xmm7
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(0)]
-        paddd     xmm4, XMMWORD [wk(1)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [rdi], xmm6   ; Save Y
-
-        sub     rcx, byte SIZEOF_XMMWORD
-        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    rcx,rcx
-        jnz     near .column_ld1
-
-        pop     rcx                     ; col
-        pop     rsi
-        pop     rdi
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     rdi, byte SIZEOF_JSAMPROW
-        dec     rax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jcgryext-sse2.asm b/media/libjpeg/simd/jcgryext-sse2.asm
deleted file mode 100644
index cd16dd1928..0000000000
--- a/media/libjpeg/simd/jcgryext-sse2.asm
+++ /dev/null
@@ -1,384 +0,0 @@
-;
-; jcgryext.asm - grayscale colorspace conversion (SSE2)
-;
-; Copyright (C) 2011, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
-;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-;                              JDIMENSION output_row, int num_rows);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION img_width
-%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
-%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
-%define output_row(b)   (b)+20          ; JDIMENSION output_row
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-
-        global  EXTN(jsimd_rgb_gray_convert_sse2)
-
-EXTN(jsimd_rgb_gray_convert_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [img_width(eax)]
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     esi, JSAMPIMAGE [output_buf(eax)]
-        mov     ecx, JDIMENSION [output_row(eax)]
-        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
-        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     esi, JSAMPARRAY [input_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        pushpic eax
-        push    edi
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr0
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        alignx  16,7
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-.column_ld1:
-        push    eax
-        push    edx
-        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
-        test    cl, SIZEOF_BYTE
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_BYTE
-        movzx   eax, BYTE [esi+ecx]
-.column_ld2:
-        test    cl, SIZEOF_WORD
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_WORD
-        movzx   edx, WORD [esi+ecx]
-        shl     eax, WORD_BIT
-        or      eax,edx
-.column_ld4:
-        movd    xmmA,eax
-        pop     edx
-        pop     eax
-        test    cl, SIZEOF_DWORD
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_DWORD
-        movd    xmmF, XMM_DWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_DWORD
-        por     xmmA,xmmF
-.column_ld8:
-        test    cl, SIZEOF_MMWORD
-        jz      short .column_ld16
-        sub     ecx, byte SIZEOF_MMWORD
-        movq    xmmB, XMM_MMWORD [esi+ecx]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmB
-.column_ld16:
-        test    cl, SIZEOF_XMMWORD
-        jz      short .column_ld32
-        movdqa  xmmF,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .rgb_gray_cnv
-.column_ld32:
-        test    cl, 2*SIZEOF_XMMWORD
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmB,xmmA
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        movdqa    xmmG,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
-        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
-        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
-
-        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
-        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
-
-        movdqa    xmmD,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
-        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
-        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
-
-        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
-        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
-
-        movdqa    xmmE,xmmA
-        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
-        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
-
-        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
-
-        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
-
-        pxor      xmmH,xmmH
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmB,xmmE
-        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-
-        movdqa    xmmF,xmmD
-        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-.column_ld1:
-        test    cl, SIZEOF_XMMWORD/16
-        jz      short .column_ld2
-        sub     ecx, byte SIZEOF_XMMWORD/16
-        movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld2:
-        test    cl, SIZEOF_XMMWORD/8
-        jz      short .column_ld4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
-        pslldq  xmmA, SIZEOF_MMWORD
-        por     xmmA,xmmE
-.column_ld4:
-        test    cl, SIZEOF_XMMWORD/4
-        jz      short .column_ld8
-        sub     ecx, byte SIZEOF_XMMWORD/4
-        movdqa  xmmE,xmmA
-        movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
-.column_ld8:
-        test    cl, SIZEOF_XMMWORD/2
-        mov     ecx, SIZEOF_XMMWORD
-        jz      short .rgb_gray_cnv
-        movdqa  xmmF,xmmA
-        movdqa  xmmH,xmmE
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        jmp     short .rgb_gray_cnv
-        alignx  16,7
-
-.columnloop:
-        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
-        movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]
-
-.rgb_gray_cnv:
-        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
-        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
-
-        movdqa    xmmC,xmmF
-        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
-        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
-
-        movdqa    xmmB,xmmA
-        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
-        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
-
-        movdqa    xmmG,xmmD
-        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
-        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
-
-        movdqa    xmmE,xmmA
-        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
-        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
-
-        movdqa    xmmH,xmmB
-        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
-        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
-
-        pxor      xmmF,xmmF
-
-        movdqa    xmmC,xmmA
-        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
-        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)
-
-        movdqa    xmmD,xmmB
-        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
-        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)
-
-        movdqa    xmmG,xmmE
-        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
-        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)
-
-        punpcklbw xmmF,xmmH
-        punpckhbw xmmH,xmmH
-        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
-        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
-        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
-
-        ; (Original)
-        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
-        ;
-        ; (This implementation)
-        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
-
-        movdqa    xmm6,xmm1
-        punpcklwd xmm1,xmm3
-        punpckhwd xmm6,xmm3
-        pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
-
-        movdqa    xmm6,xmm0
-        punpcklwd xmm0,xmm2
-        punpckhwd xmm6,xmm2
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
-        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
-
-        movdqa    xmm0, xmm5    ; xmm0=BO
-        movdqa    xmm6, xmm4    ; xmm6=BE
-
-        movdqa    xmm4,xmm0
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm4,xmm3
-        pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
-
-        movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]
-
-        paddd     xmm0, xmm1
-        paddd     xmm4, xmm7
-        paddd     xmm0,xmm3
-        paddd     xmm4,xmm3
-        psrld     xmm0,SCALEBITS        ; xmm0=YOL
-        psrld     xmm4,SCALEBITS        ; xmm4=YOH
-        packssdw  xmm0,xmm4             ; xmm0=YO
-
-        movdqa    xmm4,xmm6
-        punpcklwd xmm6,xmm2
-        punpckhwd xmm4,xmm2
-        pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
-        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
-
-        movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]
-
-        paddd     xmm6, XMMWORD [wk(0)]
-        paddd     xmm4, XMMWORD [wk(1)]
-        paddd     xmm6,xmm2
-        paddd     xmm4,xmm2
-        psrld     xmm6,SCALEBITS        ; xmm6=YEL
-        psrld     xmm4,SCALEBITS        ; xmm4=YEH
-        packssdw  xmm6,xmm4             ; xmm6=YE
-
-        psllw     xmm0,BYTE_BIT
-        por       xmm6,xmm0             ; xmm6=Y
-        movdqa    XMMWORD [edi], xmm6   ; Save Y
-
-        sub     ecx, byte SIZEOF_XMMWORD
-        add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
-        add     edi, byte SIZEOF_XMMWORD                ; outptr0
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .column_ld1
-
-        pop     ecx                     ; col
-        pop     esi
-        pop     edi
-        poppic  eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
-        add     edi, byte SIZEOF_JSAMPROW
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jchuff-sse2-64.asm b/media/libjpeg/simd/jchuff-sse2-64.asm
deleted file mode 100644
index b1144d1cdd..0000000000
--- a/media/libjpeg/simd/jchuff-sse2-64.asm
+++ /dev/null
@@ -1,360 +0,0 @@
-;
-; jchuff-sse2-64.asm - Huffman entropy encoding (64-bit SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
-        sub put_bits, 8  ; put_bits -= 8;
-        mov rdx, put_buffer
-        mov ecx, put_bits
-        shr rdx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-        mov byte [buffer], dl  ; *buffer++ = c;
-        add buffer, 1
-        cmp dl, 0xFF  ; need to stuff a zero byte?
-        jne %%.EMIT_BYTE_END
-        mov byte [buffer], 0  ; *buffer++ = 0;
-        add buffer, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
-        add put_bits, ecx  ; put_bits += size;
-        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
-        or  put_buffer, %1
-%endmacro
-
-%macro CHECKBUF31 0
-        cmp put_bits, 32  ; if (put_bits > 31) {
-        jl %%.CHECKBUF31_END
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-%%.CHECKBUF31_END:
-%endmacro
-
-%macro CHECKBUF47 0
-        cmp put_bits, 48  ; if (put_bits > 47) {
-        jl %%.CHECKBUF47_END
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-        EMIT_BYTE
-%%.CHECKBUF47_END:
-%endmacro
-
-%macro EMIT_BITS 2
-        CHECKBUF47
-        mov ecx, %2
-        PUT_BITS %1
-%endmacro
-
-%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor xmm8, xmm8  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm9, xmm9  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm10, xmm10  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm11, xmm11  ; __m128i neg = _mm_setzero_si128();
-    pinsrw %34, word [r12 + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw %35, word [r12 + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw %36, word [r12 + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw %37, word [r12 + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw %34, word [r12 + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw %35, word [r12 + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw %36, word [r12 + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw %37, word [r12 + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw %34, word [r12 + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw %35, word [r12 + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw %36, word [r12 + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw %37, word [r12 + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw %34, word [r12 + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw %35, word [r12 + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw %36, word [r12 + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw %37, word [r12 + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw %34, word [r12 + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw %35, word [r12 + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw %36, word [r12 + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw %37, word [r12 + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw %34, word [r12 + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw %35, word [r12 + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw %36, word [r12 + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw %37, word [r12 + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw %34, word [r12 + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw %35, word [r12 + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw %36, word [r12 + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw %37, word [r12 + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw %34, word [r12 + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw %35, word [r12 + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw %36, word [r12 + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw %37, word [r12 + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw %37, ebx, 7  ; xmm_shadow[31] = block[jno31];
-%endif
-    pcmpgtw xmm8, %34  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm9, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm10, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm11, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw %34, xmm8   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %35, xmm9   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %36, xmm10  ; x1 = _mm_add_epi16(x1, neg);
-    paddw %37, xmm11  ; x1 = _mm_add_epi16(x1, neg);
-    pxor %34, xmm8    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %35, xmm9    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %36, xmm10   ; x1 = _mm_xor_si128(x1, neg);
-    pxor %37, xmm11   ; x1 = _mm_xor_si128(x1, neg);
-    pxor xmm8, %34    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm9, %35    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm10, %36   ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm11, %37   ; neg = _mm_xor_si128(neg, x1);
-    movdqa XMMWORD [t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-;                                   JCOEFPTR block, int last_dc_val,
-;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; r10 = working_state *state
-; r11 = JOCTET *buffer
-; r12 = JCOEFPTR block
-; r13 = int last_dc_val
-; r14 = c_derived_tbl *dctbl
-; r15 = c_derived_tbl *actbl
-
-%define t1              rbp-(DCTSIZE2*SIZEOF_WORD)
-%define t2              t1-(DCTSIZE2*SIZEOF_WORD)
-%define put_buffer      r8
-%define put_bits        r9d
-%define buffer          rax
-
-        align   16
-        global  EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [t2]
-        collect_args
-%ifdef WIN64
-        movaps  XMMWORD [rsp-1*SIZEOF_XMMWORD], xmm8
-        movaps  XMMWORD [rsp-2*SIZEOF_XMMWORD], xmm9
-        movaps  XMMWORD [rsp-3*SIZEOF_XMMWORD], xmm10
-        movaps  XMMWORD [rsp-4*SIZEOF_XMMWORD], xmm11
-        sub     rsp, 4*SIZEOF_XMMWORD
-%endif
-        push rbx
-
-        mov buffer, r11  ; r11 is now sratch
-
-        mov put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
-        mov put_bits,    DWORD [r10+24]  ; put_bits = state->cur.put_bits;
-        push r10  ; r10 is now scratch
-
-        ; Encode the DC coefficient difference per section F.1.2.1
-        movsx edi, word [r12]  ; temp = temp2 = block[0] - last_dc_val;
-        sub   edi, r13d  ; r13 is not used anymore
-        mov   ebx, edi
-
-        ; This is a well-known technique for obtaining the absolute value
-        ; without a branch.  It is derived from an assembly language technique
-        ; presented in "How to Optimize for the Pentium Processors",
-        ; Copyright (c) 1996, 1997 by Agner Fog.
-        mov esi, edi
-        sar esi, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-        xor edi, esi  ; temp ^= temp3;
-        sub edi, esi  ; temp -= temp3;
-
-        ; For a negative input, want temp2 = bitwise complement of abs(input)
-        ; This code assumes we are on a two's complement machine
-        add ebx, esi  ; temp2 += temp3;
-
-        ; Find the number of bits needed for the magnitude of the coefficient
-        lea   r11, [rel jpeg_nbits_table]
-        movzx rdi, byte [r11 + rdi]  ; nbits = JPEG_NBITS(temp);
-        ; Emit the Huffman-coded symbol for the number of bits
-        mov   r11d,  INT [r14 + rdi * 4]  ; code = dctbl->ehufco[nbits];
-        movzx  esi, byte [r14 + rdi + 1024]  ; size = dctbl->ehufsi[nbits];
-        EMIT_BITS r11, esi  ; EMIT_BITS(code, size)
-
-        ; Mask off any extra bits in code
-        mov esi, 1
-        mov ecx, edi
-        shl esi, cl
-        dec esi
-        and ebx, esi  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
-        ; Emit that number of bits of the value, if positive,
-        ; or the complement of its magnitude, if negative.
-        EMIT_BITS rbx, edi  ; EMIT_BITS(temp2, nbits)
-
-        ; Prepare data
-        xor ebx, ebx
-        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                       xmm0, xmm1, xmm2, xmm3
-        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                       xmm4, xmm5, xmm6, xmm7
-
-        pxor xmm8, xmm8
-        pcmpeqw xmm0, xmm8  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-        pcmpeqw xmm1, xmm8  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-        pcmpeqw xmm2, xmm8  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-        pcmpeqw xmm3, xmm8  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-        pcmpeqw xmm4, xmm8  ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
-        pcmpeqw xmm5, xmm8  ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
-        pcmpeqw xmm6, xmm8  ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
-        pcmpeqw xmm7, xmm8  ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
-        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-        packsswb xmm4, xmm5  ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
-        packsswb xmm6, xmm7  ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
-        pmovmskb r11d, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-        pmovmskb r12d, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-        pmovmskb r13d, xmm4  ; index  = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
-        pmovmskb r14d, xmm6  ; index  = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
-        shl r12, 16
-        shl r14, 16
-        or  r11, r12
-        or  r13, r14
-        shl r13, 32
-        or  r11, r13
-        not r11  ; index = ~index;
-
-        ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
-        ;jmp .EFN
-
-        mov   r13d,  INT [r15 + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
-        movzx r14d, byte [r15 + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-        lea rsi, [t1]
-.BLOOP:
-        bsf r12, r11  ; r = __builtin_ctzl(index);
-        jz .ELOOP
-        mov rcx, r12
-        lea rsi, [rsi+r12*2]  ; k += r;
-        shr r11, cl  ; index >>= r;
-        movzx rdi, word [rsi]  ; temp = t1[k];
-        lea   rbx, [rel jpeg_nbits_table]
-        movzx rdi, byte [rbx + rdi]  ; nbits = JPEG_NBITS(temp);
-.BRLOOP:
-        cmp r12, 16  ; while (r > 15) {
-        jl .ERLOOP
-        EMIT_BITS r13, r14d  ; EMIT_BITS(code_0xf0, size_0xf0)
-        sub r12, 16  ; r -= 16;
-        jmp .BRLOOP
-.ERLOOP:
-        ; Emit Huffman symbol for run length / number of bits
-        CHECKBUF31  ; uses rcx, rdx
-
-        shl r12, 4  ; temp3 = (r << 4) + nbits;
-        add r12, rdi
-        mov   ebx,  INT [r15 + r12 * 4]  ; code = actbl->ehufco[temp3];
-        movzx ecx, byte [r15 + r12 + 1024]  ; size = actbl->ehufsi[temp3];
-        PUT_BITS rbx
-
-        ;EMIT_CODE(code, size)
-
-        movsx ebx, word [rsi-DCTSIZE2*2]  ; temp2 = t2[k];
-        ; Mask off any extra bits in code
-        mov rcx, rdi
-        mov rdx, 1
-        shl rdx, cl
-        dec rdx
-        and rbx, rdx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-        PUT_BITS rbx  ; PUT_BITS(temp2, nbits)
-
-        shr r11, 1  ; index >>= 1;
-        add rsi, 2  ; ++k;
-        jmp .BLOOP
-.ELOOP:
-        ; If the last coef(s) were zero, emit an end-of-block code
-        lea rdi, [t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-        cmp rdi, rsi  ; if (r > 0) {
-        je .EFN
-        mov   ebx,  INT [r15]  ; code = actbl->ehufco[0];
-        movzx r12d, byte [r15 + 1024]  ; size = actbl->ehufsi[0];
-        EMIT_BITS rbx, r12d
-.EFN:
-        pop r10
-        ; Save put_buffer & put_bits
-        mov MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
-        mov DWORD  [r10+24], put_bits  ; state->cur.put_bits = put_bits;
-
-        pop rbx
-%ifdef WIN64
-        movaps  xmm11, XMMWORD [rsp+0*SIZEOF_XMMWORD]
-        movaps  xmm10, XMMWORD [rsp+1*SIZEOF_XMMWORD]
-        movaps  xmm9, XMMWORD [rsp+2*SIZEOF_XMMWORD]
-        movaps  xmm8, XMMWORD [rsp+3*SIZEOF_XMMWORD]
-        add     rsp, 4*SIZEOF_XMMWORD
-%endif
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jchuff-sse2.asm b/media/libjpeg/simd/jchuff-sse2.asm
deleted file mode 100644
index 36d1f2db66..0000000000
--- a/media/libjpeg/simd/jchuff-sse2.asm
+++ /dev/null
@@ -1,426 +0,0 @@
-;
-; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
-;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
-; Copyright (C) 2015, Matthieu Darbois.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_huff_encode_one_block)
-
-EXTN(jconst_huff_encode_one_block):
-
-%include "jpeg_nbits_table.inc"
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
-        sub put_bits, 8  ; put_bits -= 8;
-        mov edx, put_buffer
-        mov ecx, put_bits
-        shr edx, cl  ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-        mov byte [eax], dl  ; *buffer++ = c;
-        add eax, 1
-        cmp dl, 0xFF  ; need to stuff a zero byte?
-        jne %%.EMIT_BYTE_END
-        mov byte [eax], 0  ; *buffer++ = 0;
-        add eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
-        add put_bits, ecx  ; put_bits += size;
-        shl put_buffer, cl  ; put_buffer = (put_buffer << size);
-        or  put_buffer, %1
-%endmacro
-
-%macro CHECKBUF15 0
-        cmp put_bits, 16  ; if (put_bits > 31) {
-        jl %%.CHECKBUF15_END
-        mov eax, POINTER [esp+buffer]
-        EMIT_BYTE
-        EMIT_BYTE
-        mov POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
-%endmacro
-
-%macro EMIT_BITS 1
-        PUT_BITS %1
-        CHECKBUF15
-%endmacro
-
-%macro kloop_prepare 37  ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor xmm4, xmm4  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm5, xmm5  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm6, xmm6  ; __m128i neg = _mm_setzero_si128();
-    pxor xmm7, xmm7  ; __m128i neg = _mm_setzero_si128();
-    pinsrw %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw %37, ecx, 7  ; xmm_shadow[31] = block[jno31];
-%endif
-    pcmpgtw xmm4, %34  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm5, %35  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm6, %36  ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw xmm7, %37  ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw %34, xmm4   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %35, xmm5   ; x1 = _mm_add_epi16(x1, neg);
-    paddw %36, xmm6  ; x1 = _mm_add_epi16(x1, neg);
-    paddw %37, xmm7  ; x1 = _mm_add_epi16(x1, neg);
-    pxor %34, xmm4    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %35, xmm5    ; x1 = _mm_xor_si128(x1, neg);
-    pxor %36, xmm6   ; x1 = _mm_xor_si128(x1, neg);
-    pxor %37, xmm7   ; x1 = _mm_xor_si128(x1, neg);
-    pxor xmm4, %34    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm5, %35    ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm6, %36   ; neg = _mm_xor_si128(neg, x1);
-    pxor xmm7, %37   ; neg = _mm_xor_si128(neg, x1);
-    movdqa XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34  ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35  ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36  ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37  ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4  ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5  ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
-%endmacro
-
-;
-; Encode a single block's worth of coefficients.
-;
-; GLOBAL(JOCTET*)
-; jsimd_huff_encode_one_block_sse2 (working_state *state, JOCTET *buffer,
-;                                   JCOEFPTR block, int last_dc_val,
-;                                   c_derived_tbl *dctbl, c_derived_tbl *actbl)
-;
-
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
-
-%define pad             6*SIZEOF_DWORD  ; Align to 16 bytes
-%define t1              pad
-%define t2              t1+(DCTSIZE2*SIZEOF_WORD)
-%define block           t2+(DCTSIZE2*SIZEOF_WORD)
-%define actbl           block+SIZEOF_DWORD
-%define buffer          actbl+SIZEOF_DWORD
-%define temp            buffer+SIZEOF_DWORD
-%define temp2           temp+SIZEOF_DWORD
-%define temp3           temp2+SIZEOF_DWORD
-%define temp4           temp3+SIZEOF_DWORD
-%define temp5           temp4+SIZEOF_DWORD
-%define gotptr          temp5+SIZEOF_DWORD  ; void *gotptr
-%define put_buffer      ebx
-%define put_bits        edi
-
-        align   16
-        global  EXTN(jsimd_huff_encode_one_block_sse2)
-
-EXTN(jsimd_huff_encode_one_block_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        sub     esp, temp5+9*SIZEOF_DWORD-pad
-        push    ebx
-        push    ecx
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-        push    ebp
-
-        mov esi, POINTER [eax+8]        ; (working_state *state)
-        mov put_buffer,  DWORD [esi+8]  ; put_buffer = state->cur.put_buffer;
-        mov put_bits,    DWORD [esi+12]  ; put_bits = state->cur.put_bits;
-        push esi  ; esi is now scratch
-
-        get_GOT edx                       ; get GOT address
-        movpic POINTER [esp+gotptr], edx  ; save GOT address
-
-        mov ecx, POINTER [eax+28]
-        mov edx, POINTER [eax+16]
-        mov esi, POINTER [eax+12]
-        mov POINTER [esp+actbl],  ecx
-        mov POINTER [esp+block],  edx
-        mov POINTER [esp+buffer], esi
-
-        ; Encode the DC coefficient difference per section F.1.2.1
-        mov esi, POINTER [esp+block]        ; block
-        movsx ecx, word [esi]  ; temp = temp2 = block[0] - last_dc_val;
-        sub   ecx, DWORD [eax+20]
-        mov   esi, ecx
-
-        ; This is a well-known technique for obtaining the absolute value
-        ; without a branch.  It is derived from an assembly language technique
-        ; presented in "How to Optimize for the Pentium Processors",
-        ; Copyright (c) 1996, 1997 by Agner Fog.
-        mov edx, ecx
-        sar edx, 31   ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-        xor ecx, edx ; temp ^= temp3;
-        sub ecx, edx ; temp -= temp3;
-
-        ; For a negative input, want temp2 = bitwise complement of abs(input)
-        ; This code assumes we are on a two's complement machine
-        add esi, edx  ; temp2 += temp3;
-        mov DWORD [esp+temp], esi  ; backup temp2 in temp
-
-        ; Find the number of bits needed for the magnitude of the coefficient
-        movpic ebp, POINTER [esp+gotptr]   ; load GOT address (ebp)
-        movzx edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
-        mov DWORD [esp+temp2], edx  ; backup nbits in temp2
-
-        ; Emit the Huffman-coded symbol for the number of bits
-        mov    ebp, POINTER [eax+24]  ; After this point, arguments are not accessible anymore
-        mov    eax,  INT [ebp + edx * 4]  ; code = dctbl->ehufco[nbits];
-        movzx  ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
-        EMIT_BITS eax  ; EMIT_BITS(code, size)
-
-        mov ecx, DWORD [esp+temp2]  ; restore nbits
-
-        ; Mask off any extra bits in code
-        mov eax, 1
-        shl eax, cl
-        dec eax
-        and eax, DWORD [esp+temp]  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-
-        ; Emit that number of bits of the value, if positive,
-        ; or the complement of its magnitude, if negative.
-        EMIT_BITS eax  ; EMIT_BITS(temp2, nbits)
-
-        ; Prepare data
-        xor ecx, ecx
-        mov esi, POINTER [esp+block]
-        kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                       18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                       27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                       xmm0, xmm1, xmm2, xmm3
-        kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                       30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                       53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                       xmm0, xmm1, xmm2, xmm3
-
-        pxor xmm7, xmm7
-        movdqa xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-        movdqa xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-        movdqa xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-        movdqa xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-        pmovmskb ecx, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-        shl ecx, 16
-        or  edx, ecx
-        not edx  ; index = ~index;
-
-        lea esi, [esp+t1]
-        mov ebp, POINTER [esp+actbl]  ; ebp = actbl
-
-.BLOOP:
-        bsf ecx, edx  ; r = __builtin_ctzl(index);
-        jz .ELOOP
-        lea esi, [esi+ecx*2]  ; k += r;
-        shr edx, cl  ; index >>= r;
-        mov DWORD [esp+temp3], edx
-.BRLOOP:
-        cmp ecx, 16  ; while (r > 15) {
-        jl .ERLOOP
-        sub ecx, 16 ; r -= 16;
-        mov DWORD [esp+temp], ecx
-        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
-        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
-        mov ecx, DWORD [esp+temp]
-        jmp .BRLOOP
-.ERLOOP:
-        movsx eax, word [esi]  ; temp = t1[k];
-        movpic edx, POINTER [esp+gotptr]   ; load GOT address (edx)
-        movzx eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
-        mov DWORD [esp+temp2], eax
-        ; Emit Huffman symbol for run length / number of bits
-        shl ecx, 4  ; temp3 = (r << 4) + nbits;
-        add ecx, eax
-        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
-        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-        EMIT_BITS eax
-
-        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
-        ; Mask off any extra bits in code
-        mov ecx, DWORD [esp+temp2]
-        mov eax, 1
-        shl eax, cl
-        dec eax
-        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
-        mov edx, DWORD [esp+temp3]
-        add esi, 2  ; ++k;
-        shr edx, 1  ; index >>= 1;
-
-        jmp .BLOOP
-.ELOOP:
-        movdqa xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-        movdqa xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-        movdqa xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-        movdqa xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-        pcmpeqw xmm0, xmm7  ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-        pcmpeqw xmm1, xmm7  ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-        pcmpeqw xmm2, xmm7  ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-        pcmpeqw xmm3, xmm7  ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-        packsswb xmm0, xmm1  ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-        packsswb xmm2, xmm3  ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-        pmovmskb edx, xmm0  ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-        pmovmskb ecx, xmm2  ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-        shl ecx, 16
-        or  edx, ecx
-        not edx  ; index = ~index;
-
-        lea eax, [esp + t1 + (DCTSIZE2/2) * 2]
-        sub eax, esi
-        shr eax, 1
-        bsf ecx, edx  ; r = __builtin_ctzl(index);
-        jz .ELOOP2
-        shr edx, cl  ; index >>= r;
-        add ecx, eax
-        lea esi, [esi+ecx*2]  ; k += r;
-        mov DWORD [esp+temp3], edx
-        jmp .BRLOOP2
-.BLOOP2:
-        bsf ecx, edx  ; r = __builtin_ctzl(index);
-        jz .ELOOP2
-        lea esi, [esi+ecx*2]  ; k += r;
-        shr edx, cl  ; index >>= r;
-        mov DWORD [esp+temp3], edx
-.BRLOOP2:
-        cmp ecx, 16  ; while (r > 15) {
-        jl .ERLOOP2
-        sub ecx, 16  ; r -= 16;
-        mov DWORD [esp+temp], ecx
-        mov   eax, INT [ebp + 240 * 4]  ; code_0xf0 = actbl->ehufco[0xf0];
-        movzx ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-        EMIT_BITS eax  ; EMIT_BITS(code_0xf0, size_0xf0)
-        mov ecx, DWORD [esp+temp]
-        jmp .BRLOOP2
-.ERLOOP2:
-        movsx eax, word [esi]  ; temp = t1[k];
-        bsr eax, eax  ; nbits = 32 - __builtin_clz(temp);
-        inc eax
-        mov DWORD [esp+temp2], eax
-        ; Emit Huffman symbol for run length / number of bits
-        shl ecx, 4  ; temp3 = (r << 4) + nbits;
-        add ecx, eax
-        mov   eax,  INT [ebp + ecx * 4]  ; code = actbl->ehufco[temp3];
-        movzx ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-        EMIT_BITS eax
-
-        movsx edx, word [esi+DCTSIZE2*2]  ; temp2 = t2[k];
-        ; Mask off any extra bits in code
-        mov ecx, DWORD [esp+temp2]
-        mov eax, 1
-        shl eax, cl
-        dec eax
-        and eax, edx  ; temp2 &= (((JLONG) 1)<<nbits) - 1;
-        EMIT_BITS eax  ; PUT_BITS(temp2, nbits)
-        mov edx, DWORD [esp+temp3]
-        add esi, 2  ; ++k;
-        shr edx, 1  ; index >>= 1;
-
-        jmp .BLOOP2
-.ELOOP2:
-        ; If the last coef(s) were zero, emit an end-of-block code
-        lea edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-        cmp edx, esi  ; if (r > 0) {
-        je .EFN
-        mov   eax,  INT [ebp]  ; code = actbl->ehufco[0];
-        movzx ecx, byte [ebp + 1024]  ; size = actbl->ehufsi[0];
-        EMIT_BITS eax
-.EFN:
-        mov eax, [esp+buffer]
-        pop esi
-        ; Save put_buffer & put_bits
-        mov DWORD [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-        mov DWORD [esi+12], put_bits  ; state->cur.put_bits = put_bits;
-
-        pop     ebp
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-        pop     ecx
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jcolsamp.inc b/media/libjpeg/simd/jcolsamp.inc
deleted file mode 100644
index 3be446e847..0000000000
--- a/media/libjpeg/simd/jcolsamp.inc
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; jcolsamp.inc - private declarations for color conversion & up/downsampling
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
-
-; --------------------------------------------------------------------------
-
-; pseudo-resisters to make ordering of RGB configurable
-;
-%if RGB_RED == 0
-%define  mmA  mm0
-%define  mmB  mm1
-%define xmmA xmm0
-%define xmmB xmm1
-%elif RGB_GREEN == 0
-%define  mmA  mm2
-%define  mmB  mm3
-%define xmmA xmm2
-%define xmmB xmm3
-%elif RGB_BLUE == 0
-%define  mmA  mm4
-%define  mmB  mm5
-%define xmmA xmm4
-%define xmmB xmm5
-%else
-%define  mmA  mm6
-%define  mmB  mm7
-%define xmmA xmm6
-%define xmmB xmm7
-%endif
-
-%if RGB_RED == 1
-%define  mmC  mm0
-%define  mmD  mm1
-%define xmmC xmm0
-%define xmmD xmm1
-%elif RGB_GREEN == 1
-%define  mmC  mm2
-%define  mmD  mm3
-%define xmmC xmm2
-%define xmmD xmm3
-%elif RGB_BLUE == 1
-%define  mmC  mm4
-%define  mmD  mm5
-%define xmmC xmm4
-%define xmmD xmm5
-%else
-%define  mmC  mm6
-%define  mmD  mm7
-%define xmmC xmm6
-%define xmmD xmm7
-%endif
-
-%if RGB_RED == 2
-%define  mmE  mm0
-%define  mmF  mm1
-%define xmmE xmm0
-%define xmmF xmm1
-%elif RGB_GREEN == 2
-%define  mmE  mm2
-%define  mmF  mm3
-%define xmmE xmm2
-%define xmmF xmm3
-%elif RGB_BLUE == 2
-%define  mmE  mm4
-%define  mmF  mm5
-%define xmmE xmm4
-%define xmmF xmm5
-%else
-%define  mmE  mm6
-%define  mmF  mm7
-%define xmmE xmm6
-%define xmmF xmm7
-%endif
-
-%if RGB_RED == 3
-%define  mmG  mm0
-%define  mmH  mm1
-%define xmmG xmm0
-%define xmmH xmm1
-%elif RGB_GREEN == 3
-%define  mmG  mm2
-%define  mmH  mm3
-%define xmmG xmm2
-%define xmmH xmm3
-%elif RGB_BLUE == 3
-%define  mmG  mm4
-%define  mmH  mm5
-%define xmmG xmm4
-%define xmmH xmm5
-%else
-%define  mmG  mm6
-%define  mmH  mm7
-%define xmmG xmm6
-%define xmmH xmm7
-%endif
-
-; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jcsample-mmx.asm b/media/libjpeg/simd/jcsample-mmx.asm
deleted file mode 100644
index 6cd544e74d..0000000000
--- a/media/libjpeg/simd/jcsample-mmx.asm
+++ /dev/null
@@ -1,323 +0,0 @@
-;
-; jcsample.asm - downsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v1_downsample_mmx)
-
-EXTN(jsimd_h2v1_downsample_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v1_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov       edx, 0x00010000       ; bias pattern
-        movd      mm7,edx
-        pcmpeqw   mm6,mm6
-        punpckldq mm7,mm7               ; mm7={0, 1, 0, 1}
-        psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
-        movq    mm2,mm0
-        movq    mm3,mm1
-
-        pand    mm0,mm6
-        psrlw   mm2,BYTE_BIT
-        pand    mm1,mm6
-        psrlw   mm3,BYTE_BIT
-
-        paddw   mm0,mm2
-        paddw   mm1,mm3
-        paddw   mm0,mm7
-        paddw   mm1,mm7
-        psrlw   mm0,1
-        psrlw   mm1,1
-
-        packuswb mm0,mm1
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
-        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
-        sub     ecx, byte SIZEOF_MMWORD         ; outcol
-        jnz     short .columnloop
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     eax                             ; rowctr
-        jg      short .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
-;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v2_downsample_mmx)
-
-EXTN(jsimd_h2v2_downsample_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v2_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov       edx, 0x00020001       ; bias pattern
-        movd      mm7,edx
-        pcmpeqw   mm6,mm6
-        punpckldq mm7,mm7               ; mm7={1, 2, 1, 2}
-        psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
-        mov     edi, JSAMPROW [edi]                     ; outptr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
-        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
-        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pand    mm0,mm6
-        psrlw   mm4,BYTE_BIT
-        pand    mm1,mm6
-        psrlw   mm5,BYTE_BIT
-        paddw   mm0,mm4
-        paddw   mm1,mm5
-
-        movq    mm4,mm2
-        movq    mm5,mm3
-        pand    mm2,mm6
-        psrlw   mm4,BYTE_BIT
-        pand    mm3,mm6
-        psrlw   mm5,BYTE_BIT
-        paddw   mm2,mm4
-        paddw   mm3,mm5
-
-        paddw   mm0,mm1
-        paddw   mm2,mm3
-        paddw   mm0,mm7
-        paddw   mm2,mm7
-        psrlw   mm0,2
-        psrlw   mm2,2
-
-        packuswb mm0,mm2
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-
-        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
-        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
-        sub     ecx, byte SIZEOF_MMWORD         ; outcol
-        jnz     near .columnloop
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
-        dec     eax                             ; rowctr
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jcsample-sse2-64.asm b/media/libjpeg/simd/jcsample-sse2-64.asm
deleted file mode 100644
index 40ee15fcbb..0000000000
--- a/media/libjpeg/simd/jcsample-sse2-64.asm
+++ /dev/null
@@ -1,329 +0,0 @@
-;
-; jcsample.asm - downsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov ecx, r13d
-        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
-        jz      near .return
-
-        mov edx, r10d
-
-        ; -- expand_right_edge
-
-        push    rcx
-        shl     rcx,1                           ; output_cols * 2
-        sub     rcx,rdx
-        jle     short .expand_end
-
-        mov     rax, r11
-        test    rax,rax
-        jle     short .expand_end
-
-        cld
-        mov     rsi, r14        ; input_data
-.expandloop:
-        push    rax
-        push    rcx
-
-        mov     rdi, JSAMPROW [rsi]
-        add     rdi,rdx
-        mov     al, JSAMPLE [rdi-1]
-
-        rep stosb
-
-        pop     rcx
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW
-        dec     rax
-        jg      short .expandloop
-
-.expand_end:
-        pop     rcx                             ; output_cols
-
-        ; -- h2v1_downsample
-
-        mov     eax, r12d        ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov     rdx, 0x00010000         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     rsi, r14        ; input_data
-        mov     rdi, r15        ; output_data
-.rowloop:
-        push    rcx
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]             ; inptr
-        mov rdi, JSAMPROW [rdi]         ; outptr
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        pxor    xmm1,xmm1
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .downsample
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm2,xmm0
-        movdqa  xmm3,xmm1
-
-        pand    xmm0,xmm6
-        psrlw   xmm2,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm3,BYTE_BIT
-
-        paddw   xmm0,xmm2
-        paddw   xmm1,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-        psrlw   xmm0,1
-        psrlw   xmm1,1
-
-        packuswb xmm0,xmm1
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        test    rcx,rcx
-        jnz     short .columnloop_r8
-
-        pop     rsi
-        pop     rdi
-        pop     rcx
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     rax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-; r10 = JDIMENSION image_width
-; r11 = int max_v_samp_factor
-; r12 = JDIMENSION v_samp_factor
-; r13 = JDIMENSION width_blocks
-; r14 = JSAMPARRAY input_data
-; r15 = JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov     ecx, r13d
-        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
-        jz      near .return
-
-        mov     edx, r10d
-
-        ; -- expand_right_edge
-
-        push    rcx
-        shl     rcx,1                           ; output_cols * 2
-        sub     rcx,rdx
-        jle     short .expand_end
-
-        mov     rax, r11
-        test    rax,rax
-        jle     short .expand_end
-
-        cld
-        mov     rsi, r14        ; input_data
-.expandloop:
-        push    rax
-        push    rcx
-
-        mov     rdi, JSAMPROW [rsi]
-        add     rdi,rdx
-        mov     al, JSAMPLE [rdi-1]
-
-        rep stosb
-
-        pop     rcx
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW
-        dec     rax
-        jg      short .expandloop
-
-.expand_end:
-        pop     rcx                             ; output_cols
-
-        ; -- h2v2_downsample
-
-        mov     eax, r12d        ; rowctr
-        test    rax,rax
-        jle     near .return
-
-        mov     rdx, 0x00020001         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     rsi, r14        ; input_data
-        mov     rdi, r15        ; output_data
-.rowloop:
-        push    rcx
-        push    rdi
-        push    rsi
-
-        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
-        mov     rdi, JSAMPROW [rdi]                     ; outptr
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        pxor    xmm2,xmm2
-        pxor    xmm3,xmm3
-        mov     rcx, SIZEOF_XMMWORD
-        jmp     short .downsample
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        pand    xmm0,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm0,xmm4
-        paddw   xmm1,xmm5
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm5,xmm3
-        pand    xmm2,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm3,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm2,xmm4
-        paddw   xmm3,xmm5
-
-        paddw   xmm0,xmm1
-        paddw   xmm2,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm7
-        psrlw   xmm0,2
-        psrlw   xmm2,2
-
-        packuswb xmm0,xmm2
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
-        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
-        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    rcx,rcx
-        jnz     near .columnloop_r8
-
-        pop     rsi
-        pop     rdi
-        pop     rcx
-
-        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
-        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
-        dec     rax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jcsample-sse2.asm b/media/libjpeg/simd/jcsample-sse2.asm
deleted file mode 100644
index 83c9d152a7..0000000000
--- a/media/libjpeg/simd/jcsample-sse2.asm
+++ /dev/null
@@ -1,350 +0,0 @@
-;
-; jcsample.asm - downsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Downsample pixel values of a single component.
-; This version handles the common case of 2:1 horizontal and 1:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v1_downsample_sse2)
-
-EXTN(jsimd_h2v1_downsample_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v1_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov     edx, 0x00010000         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        pxor    xmm1,xmm1
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .downsample
-        alignx  16,7
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm2,xmm0
-        movdqa  xmm3,xmm1
-
-        pand    xmm0,xmm6
-        psrlw   xmm2,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm3,BYTE_BIT
-
-        paddw   xmm0,xmm2
-        paddw   xmm1,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-        psrlw   xmm0,1
-        psrlw   xmm1,1
-
-        packuswb xmm0,xmm1
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        test    ecx,ecx
-        jnz     short .columnloop_r8
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     eax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Downsample pixel values of a single component.
-; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
-; without smoothing.
-;
-; GLOBAL(void)
-; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
-;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
-;
-
-%define img_width(b)    (b)+8           ; JDIMENSION image_width
-%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
-%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
-%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
-%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
-%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
-
-        align   16
-        global  EXTN(jsimd_h2v2_downsample_sse2)
-
-EXTN(jsimd_h2v2_downsample_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     ecx, JDIMENSION [width_blks(ebp)]
-        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
-        jz      near .return
-
-        mov     edx, JDIMENSION [img_width(ebp)]
-
-        ; -- expand_right_edge
-
-        push    ecx
-        shl     ecx,1                           ; output_cols * 2
-        sub     ecx,edx
-        jle     short .expand_end
-
-        mov     eax, INT [max_v_samp(ebp)]
-        test    eax,eax
-        jle     short .expand_end
-
-        cld
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        alignx  16,7
-.expandloop:
-        push    eax
-        push    ecx
-
-        mov     edi, JSAMPROW [esi]
-        add     edi,edx
-        mov     al, JSAMPLE [edi-1]
-
-        rep stosb
-
-        pop     ecx
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        dec     eax
-        jg      short .expandloop
-
-.expand_end:
-        pop     ecx                             ; output_cols
-
-        ; -- h2v2_downsample
-
-        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
-        test    eax,eax
-        jle     near .return
-
-        mov     edx, 0x00020001         ; bias pattern
-        movd    xmm7,edx
-        pcmpeqw xmm6,xmm6
-        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
-        alignx  16,7
-.rowloop:
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
-        mov     edi, JSAMPROW [edi]                     ; outptr
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     short .columnloop
-        alignx  16,7
-
-.columnloop_r8:
-        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        pxor    xmm2,xmm2
-        pxor    xmm3,xmm3
-        mov     ecx, SIZEOF_XMMWORD
-        jmp     short .downsample
-        alignx  16,7
-
-.columnloop:
-        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
-        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-.downsample:
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        pand    xmm0,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm1,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm0,xmm4
-        paddw   xmm1,xmm5
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm5,xmm3
-        pand    xmm2,xmm6
-        psrlw   xmm4,BYTE_BIT
-        pand    xmm3,xmm6
-        psrlw   xmm5,BYTE_BIT
-        paddw   xmm2,xmm4
-        paddw   xmm3,xmm5
-
-        paddw   xmm0,xmm1
-        paddw   xmm2,xmm3
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm7
-        psrlw   xmm0,2
-        psrlw   xmm2,2
-
-        packuswb xmm0,xmm2
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-
-        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
-        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
-        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jae     near .columnloop
-        test    ecx,ecx
-        jnz     near .columnloop_r8
-
-        pop     esi
-        pop     edi
-        pop     ecx
-
-        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
-        dec     eax                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdcolext-mmx.asm b/media/libjpeg/simd/jdcolext-mmx.asm
deleted file mode 100644
index 21e34f6786..0000000000
--- a/media/libjpeg/simd/jdcolext-mmx.asm
+++ /dev/null
@@ -1,404 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
-;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                            JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)    (b)+8           ; JDIMENSION out_width
-%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
-%define input_row(b)    (b)+16          ; JDIMENSION input_row
-%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_ycc_rgb_convert_mmx)
-
-EXTN(jsimd_ycc_rgb_convert_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [input_row(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        push    eax
-        push    edi
-        push    edx
-        push    ebx
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr0
-        mov     ebx, JSAMPROW [ebx]     ; inptr1
-        mov     edx, JSAMPROW [edx]     ; inptr2
-        mov     edi, JSAMPROW [edi]     ; outptr
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-        alignx  16,7
-.columnloop:
-
-        movq    mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
-        movq    mm1, MMWORD [edx]       ; mm1=Cr(01234567)
-
-        pcmpeqw mm4,mm4
-        pcmpeqw mm7,mm7
-        psrlw   mm4,BYTE_BIT
-        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-        movq    mm0,mm4                 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
-
-        pand    mm4,mm5                 ; mm4=Cb(0246)=CbE
-        psrlw   mm5,BYTE_BIT            ; mm5=Cb(1357)=CbO
-        pand    mm0,mm1                 ; mm0=Cr(0246)=CrE
-        psrlw   mm1,BYTE_BIT            ; mm1=Cr(1357)=CrO
-
-        paddw   mm4,mm7
-        paddw   mm5,mm7
-        paddw   mm0,mm7
-        paddw   mm1,mm7
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movq    mm2,mm4                 ; mm2=CbE
-        movq    mm3,mm5                 ; mm3=CbO
-        paddw   mm4,mm4                 ; mm4=2*CbE
-        paddw   mm5,mm5                 ; mm5=2*CbO
-        movq    mm6,mm0                 ; mm6=CrE
-        movq    mm7,mm1                 ; mm7=CrO
-        paddw   mm0,mm0                 ; mm0=2*CrE
-        paddw   mm1,mm1                 ; mm1=2*CrO
-
-        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbE * -FIX(0.22800))
-        pmulhw  mm5,[GOTOFF(eax,PW_MF0228)]     ; mm5=(2*CbO * -FIX(0.22800))
-        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrE * FIX(0.40200))
-        pmulhw  mm1,[GOTOFF(eax,PW_F0402)]      ; mm1=(2*CrO * FIX(0.40200))
-
-        paddw   mm4,[GOTOFF(eax,PW_ONE)]
-        paddw   mm5,[GOTOFF(eax,PW_ONE)]
-        psraw   mm4,1                   ; mm4=(CbE * -FIX(0.22800))
-        psraw   mm5,1                   ; mm5=(CbO * -FIX(0.22800))
-        paddw   mm0,[GOTOFF(eax,PW_ONE)]
-        paddw   mm1,[GOTOFF(eax,PW_ONE)]
-        psraw   mm0,1                   ; mm0=(CrE * FIX(0.40200))
-        psraw   mm1,1                   ; mm1=(CrO * FIX(0.40200))
-
-        paddw   mm4,mm2
-        paddw   mm5,mm3
-        paddw   mm4,mm2                 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
-        paddw   mm5,mm3                 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
-        paddw   mm0,mm6                 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
-        paddw   mm1,mm7                 ; mm1=(CrO * FIX(1.40200))=(R-Y)O
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
-        movq    MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O
-
-        movq      mm4,mm2
-        movq      mm5,mm3
-        punpcklwd mm2,mm6
-        punpckhwd mm4,mm6
-        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd mm3,mm7
-        punpckhwd mm5,mm7
-        pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm2,SCALEBITS
-        psrad     mm4,SCALEBITS
-        paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm3,SCALEBITS
-        psrad     mm5,SCALEBITS
-
-        packssdw  mm2,mm4       ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-        packssdw  mm3,mm5       ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-        psubw     mm2,mm6       ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-        psubw     mm3,mm7       ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-        movq      mm5, MMWORD [esi]     ; mm5=Y(01234567)
-
-        pcmpeqw   mm4,mm4
-        psrlw     mm4,BYTE_BIT          ; mm4={0xFF 0x00 0xFF 0x00 ..}
-        pand      mm4,mm5               ; mm4=Y(0246)=YE
-        psrlw     mm5,BYTE_BIT          ; mm5=Y(1357)=YO
-
-        paddw     mm0,mm4               ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
-        paddw     mm1,mm5               ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
-        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
-        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-        paddw     mm2,mm4               ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
-        paddw     mm3,mm5               ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
-        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
-        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-        paddw     mm4, MMWORD [wk(0)]   ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
-        paddw     mm5, MMWORD [wk(1)]   ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
-        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
-        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
-        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
-
-        movq      mmG,mmA
-        movq      mmH,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
-        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
-
-        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
-        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
-
-        movq      mmC,mmD
-        movq      mmB,mmD
-        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
-        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
-
-        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
-
-        movq      mmF,mmE
-        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
-        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
-
-        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
-        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
-        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_MMWORD
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    mmA,mmC
-        sub     ecx, byte 2*SIZEOF_MMWORD
-        add     edi, byte 2*SIZEOF_MMWORD
-        jmp     short .column_st4
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmE
-        sub     ecx, byte SIZEOF_MMWORD
-        add     edi, byte SIZEOF_MMWORD
-.column_st4:
-        movd    eax,mmA
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st2
-        mov     DWORD [edi+0*SIZEOF_DWORD], eax
-        psrlq   mmA,DWORD_BIT
-        movd    eax,mmA
-        sub     ecx, byte SIZEOF_DWORD
-        add     edi, byte SIZEOF_DWORD
-.column_st2:
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi+0*SIZEOF_WORD], ax
-        shr     eax,WORD_BIT
-        sub     ecx, byte SIZEOF_WORD
-        add     edi, byte SIZEOF_WORD
-.column_st1:
-        cmp     ecx, byte SIZEOF_BYTE
-        jb      short .nextrow
-        mov     BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
-        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
-        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
-
-        movq      mmC,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
-        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
-        movq      mmG,mmB
-        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
-        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
-
-        movq      mmD,mmA
-        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
-        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
-        movq      mmH,mmC
-        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
-        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        cmp     ecx, byte SIZEOF_MMWORD/2
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    mmA,mmC
-        movq    mmD,mmH
-        sub     ecx, byte SIZEOF_MMWORD/2
-        add     edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD/4
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmD
-        sub     ecx, byte SIZEOF_MMWORD/4
-        add     edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-        cmp     ecx, byte SIZEOF_MMWORD/8
-        jb      short .nextrow
-        movd    DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        alignx  16,7
-
-.nextrow:
-        pop     ecx
-        pop     esi
-        pop     ebx
-        pop     edx
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdcolext-sse2-64.asm b/media/libjpeg/simd/jdcolext-sse2-64.asm
deleted file mode 100644
index 4634066c45..0000000000
--- a/media/libjpeg/simd/jdcolext-sse2-64.asm
+++ /dev/null
@@ -1,440 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-; r10 = JDIMENSION out_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION input_row
-; r13 = JSAMPARRAY output_buf
-; r14 = int num_rows
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d        ; num_cols
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov     rdi, r11
-        mov     ecx, r12d
-        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]
-        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]
-
-        pop     rcx
-
-        mov     rdi, r13
-        mov     eax, r14d
-        test    rax,rax
-        jle     near .return
-.rowloop:
-        push    rax
-        push    rdi
-        push    rdx
-        push    rbx
-        push    rsi
-        push    rcx                     ; col
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr0
-        mov     rbx, JSAMPROW [rbx]     ; inptr1
-        mov     rdx, JSAMPROW [rdx]     ; inptr2
-        mov     rdi, JSAMPROW [rdi]     ; outptr
-.columnloop:
-
-        movdqa  xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
-        movdqa  xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
-
-        pcmpeqw xmm4,xmm4
-        pcmpeqw xmm7,xmm7
-        psrlw   xmm4,BYTE_BIT
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-        movdqa  xmm0,xmm4               ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-        pand    xmm4,xmm5               ; xmm4=Cb(02468ACE)=CbE
-        psrlw   xmm5,BYTE_BIT           ; xmm5=Cb(13579BDF)=CbO
-        pand    xmm0,xmm1               ; xmm0=Cr(02468ACE)=CrE
-        psrlw   xmm1,BYTE_BIT           ; xmm1=Cr(13579BDF)=CrO
-
-        paddw   xmm4,xmm7
-        paddw   xmm5,xmm7
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm2,xmm4               ; xmm2=CbE
-        movdqa  xmm3,xmm5               ; xmm3=CbO
-        paddw   xmm4,xmm4               ; xmm4=2*CbE
-        paddw   xmm5,xmm5               ; xmm5=2*CbO
-        movdqa  xmm6,xmm0               ; xmm6=CrE
-        movdqa  xmm7,xmm1               ; xmm7=CrO
-        paddw   xmm0,xmm0               ; xmm0=2*CrE
-        paddw   xmm1,xmm1               ; xmm1=2*CrO
-
-        pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbE * -FIX(0.22800))
-        pmulhw  xmm5,[rel PW_MF0228]    ; xmm5=(2*CbO * -FIX(0.22800))
-        pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrE * FIX(0.40200))
-        pmulhw  xmm1,[rel PW_F0402]     ; xmm1=(2*CrO * FIX(0.40200))
-
-        paddw   xmm4,[rel PW_ONE]
-        paddw   xmm5,[rel PW_ONE]
-        psraw   xmm4,1                  ; xmm4=(CbE * -FIX(0.22800))
-        psraw   xmm5,1                  ; xmm5=(CbO * -FIX(0.22800))
-        paddw   xmm0,[rel PW_ONE]
-        paddw   xmm1,[rel PW_ONE]
-        psraw   xmm0,1                  ; xmm0=(CrE * FIX(0.40200))
-        psraw   xmm1,1                  ; xmm1=(CrO * FIX(0.40200))
-
-        paddw   xmm4,xmm2
-        paddw   xmm5,xmm3
-        paddw   xmm4,xmm2               ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-        paddw   xmm5,xmm3               ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-        paddw   xmm0,xmm6               ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-        paddw   xmm1,xmm7               ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm5,xmm3
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm4,xmm6
-        pmaddwd   xmm2,[rel PW_MF0344_F0285]
-        pmaddwd   xmm4,[rel PW_MF0344_F0285]
-        punpcklwd xmm3,xmm7
-        punpckhwd xmm5,xmm7
-        pmaddwd   xmm3,[rel PW_MF0344_F0285]
-        pmaddwd   xmm5,[rel PW_MF0344_F0285]
-
-        paddd     xmm2,[rel PD_ONEHALF]
-        paddd     xmm4,[rel PD_ONEHALF]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm4,SCALEBITS
-        paddd     xmm3,[rel PD_ONEHALF]
-        paddd     xmm5,[rel PD_ONEHALF]
-        psrad     xmm3,SCALEBITS
-        psrad     xmm5,SCALEBITS
-
-        packssdw  xmm2,xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-        packssdw  xmm3,xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-        psubw     xmm2,xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-        psubw     xmm3,xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-        movdqa    xmm5, XMMWORD [rsi]   ; xmm5=Y(0123456789ABCDEF)
-
-        pcmpeqw   xmm4,xmm4
-        psrlw     xmm4,BYTE_BIT         ; xmm4={0xFF 0x00 0xFF 0x00 ..}
-        pand      xmm4,xmm5             ; xmm4=Y(02468ACE)=YE
-        psrlw     xmm5,BYTE_BIT         ; xmm5=Y(13579BDF)=YO
-
-        paddw     xmm0,xmm4             ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm5             ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm4             ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm5             ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-        paddw     xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     rcx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     rcx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_MMWORD
-        sub     rcx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_DWORD
-        sub     rcx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of rax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     rcx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [rdi], ax
-        add     rdi, byte SIZEOF_WORD
-        sub     rcx, byte SIZEOF_WORD
-        shr     rax, 16
-.column_st1:
-        ; Store the lower 1 byte of rax to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .nextrow
-        mov     BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        cmp     rcx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_XMMWORD/8*4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .nextrow
-        movd    XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.nextrow:
-        pop     rcx
-        pop     rsi
-        pop     rbx
-        pop     rdx
-        pop     rdi
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW
-        add     rbx, byte SIZEOF_JSAMPROW
-        add     rdx, byte SIZEOF_JSAMPROW
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_buf
-        dec     rax                             ; num_rows
-        jg      near .rowloop
-
-        sfence          ; flush the write buffer
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdcolext-sse2.asm b/media/libjpeg/simd/jdcolext-sse2.asm
deleted file mode 100644
index 682aef35fc..0000000000
--- a/media/libjpeg/simd/jdcolext-sse2.asm
+++ /dev/null
@@ -1,459 +0,0 @@
-;
-; jdcolext.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Convert some rows of samples to the output colorspace.
-;
-; GLOBAL(void)
-; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width,
-;                             JSAMPIMAGE input_buf, JDIMENSION input_row,
-;                             JSAMPARRAY output_buf, int num_rows)
-;
-
-%define out_width(b)    (b)+8           ; JDIMENSION out_width
-%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
-%define input_row(b)    (b)+16          ; JDIMENSION input_row
-%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
-%define num_rows(b)     (b)+24          ; int num_rows
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_ycc_rgb_convert_sse2)
-
-EXTN(jsimd_ycc_rgb_convert_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [input_row(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
-        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
-
-        pop     ecx
-
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     eax, INT [num_rows(eax)]
-        test    eax,eax
-        jle     near .return
-        alignx  16,7
-.rowloop:
-        push    eax
-        push    edi
-        push    edx
-        push    ebx
-        push    esi
-        push    ecx                     ; col
-
-        mov     esi, JSAMPROW [esi]     ; inptr0
-        mov     ebx, JSAMPROW [ebx]     ; inptr1
-        mov     edx, JSAMPROW [edx]     ; inptr2
-        mov     edi, JSAMPROW [edi]     ; outptr
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-        alignx  16,7
-.columnloop:
-
-        movdqa  xmm5, XMMWORD [ebx]     ; xmm5=Cb(0123456789ABCDEF)
-        movdqa  xmm1, XMMWORD [edx]     ; xmm1=Cr(0123456789ABCDEF)
-
-        pcmpeqw xmm4,xmm4
-        pcmpeqw xmm7,xmm7
-        psrlw   xmm4,BYTE_BIT
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-        movdqa  xmm0,xmm4               ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
-
-        pand    xmm4,xmm5               ; xmm4=Cb(02468ACE)=CbE
-        psrlw   xmm5,BYTE_BIT           ; xmm5=Cb(13579BDF)=CbO
-        pand    xmm0,xmm1               ; xmm0=Cr(02468ACE)=CrE
-        psrlw   xmm1,BYTE_BIT           ; xmm1=Cr(13579BDF)=CrO
-
-        paddw   xmm4,xmm7
-        paddw   xmm5,xmm7
-        paddw   xmm0,xmm7
-        paddw   xmm1,xmm7
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm2,xmm4               ; xmm2=CbE
-        movdqa  xmm3,xmm5               ; xmm3=CbO
-        paddw   xmm4,xmm4               ; xmm4=2*CbE
-        paddw   xmm5,xmm5               ; xmm5=2*CbO
-        movdqa  xmm6,xmm0               ; xmm6=CrE
-        movdqa  xmm7,xmm1               ; xmm7=CrO
-        paddw   xmm0,xmm0               ; xmm0=2*CrE
-        paddw   xmm1,xmm1               ; xmm1=2*CrO
-
-        pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbE * -FIX(0.22800))
-        pmulhw  xmm5,[GOTOFF(eax,PW_MF0228)]    ; xmm5=(2*CbO * -FIX(0.22800))
-        pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrE * FIX(0.40200))
-        pmulhw  xmm1,[GOTOFF(eax,PW_F0402)]     ; xmm1=(2*CrO * FIX(0.40200))
-
-        paddw   xmm4,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm5,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm4,1                  ; xmm4=(CbE * -FIX(0.22800))
-        psraw   xmm5,1                  ; xmm5=(CbO * -FIX(0.22800))
-        paddw   xmm0,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm1,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm0,1                  ; xmm0=(CrE * FIX(0.40200))
-        psraw   xmm1,1                  ; xmm1=(CrO * FIX(0.40200))
-
-        paddw   xmm4,xmm2
-        paddw   xmm5,xmm3
-        paddw   xmm4,xmm2               ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
-        paddw   xmm5,xmm3               ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
-        paddw   xmm0,xmm6               ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
-        paddw   xmm1,xmm7               ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm5,xmm3
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm4,xmm6
-        pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm4,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd xmm3,xmm7
-        punpckhwd xmm5,xmm7
-        pmaddwd   xmm3,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm4,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm4,SCALEBITS
-        paddd     xmm3,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm3,SCALEBITS
-        psrad     xmm5,SCALEBITS
-
-        packssdw  xmm2,xmm4     ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
-        packssdw  xmm3,xmm5     ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
-        psubw     xmm2,xmm6     ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
-        psubw     xmm3,xmm7     ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
-
-        movdqa    xmm5, XMMWORD [esi]   ; xmm5=Y(0123456789ABCDEF)
-
-        pcmpeqw   xmm4,xmm4
-        psrlw     xmm4,BYTE_BIT         ; xmm4={0xFF 0x00 0xFF 0x00 ..}
-        pand      xmm4,xmm5             ; xmm4=Y(02468ACE)=YE
-        psrlw     xmm5,BYTE_BIT         ; xmm5=Y(13579BDF)=YO
-
-        paddw     xmm0,xmm4             ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm5             ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm4             ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm5             ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
-        paddw     xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     ecx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_MMWORD
-        sub     ecx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [edi], xmmA
-        add     edi, byte SIZEOF_DWORD
-        sub     ecx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of eax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi], ax
-        add     edi, byte SIZEOF_WORD
-        sub     ecx, byte SIZEOF_WORD
-        shr     eax, 16
-.column_st1:
-        ; Store the lower 1 byte of eax to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .nextrow
-        mov     BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .nextrow
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        cmp     ecx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_XMMWORD/8*4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .nextrow
-        movd    XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-        alignx  16,7
-
-.nextrow:
-        pop     ecx
-        pop     esi
-        pop     ebx
-        pop     edx
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW
-        add     ebx, byte SIZEOF_JSAMPROW
-        add     edx, byte SIZEOF_JSAMPROW
-        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
-        dec     eax                             ; num_rows
-        jg      near .rowloop
-
-        sfence          ; flush the write buffer
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdcolor-altivec.c b/media/libjpeg/simd/jdcolor-altivec.c
deleted file mode 100644
index 0dc4c427c2..0000000000
--- a/media/libjpeg/simd/jdcolor-altivec.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* YCC --> RGB CONVERSION */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554              /* FIX(0.34414) */
-#define F_0_714 46802              /* FIX(0.71414) */
-#define F_1_402 91881              /* FIX(1.40200) */
-#define F_1_772 116130             /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extrgbx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extbgrx_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxbgr_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
-#define jsimd_ycc_rgb_convert_altivec jsimd_ycc_extxrgb_convert_altivec
-#include "jdcolext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_ycc_rgb_convert_altivec
diff --git a/media/libjpeg/simd/jdcolor-mmx.asm b/media/libjpeg/simd/jdcolor-mmx.asm
deleted file mode 100644
index 4e58031dd0..0000000000
--- a/media/libjpeg/simd/jdcolor-mmx.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_ycc_rgb_convert_mmx)
-
-EXTN(jconst_ycc_rgb_convert_mmx):
-
-PW_F0402        times 4 dw  F_0_402
-PW_MF0228       times 4 dw -F_0_228
-PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
-PW_ONE          times 4 dw  1
-PD_ONEHALF      times 2 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgb_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extrgbx_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgr_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extbgrx_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxbgr_convert_mmx
-%include "jdcolext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_mmx jsimd_ycc_extxrgb_convert_mmx
-%include "jdcolext-mmx.asm"
diff --git a/media/libjpeg/simd/jdcolor-sse2-64.asm b/media/libjpeg/simd/jdcolor-sse2-64.asm
deleted file mode 100644
index d2bf210007..0000000000
--- a/media/libjpeg/simd/jdcolor-sse2-64.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_ycc_rgb_convert_sse2)
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402        times 8 dw  F_0_402
-PW_MF0228       times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdcolext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdcolext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jdcolor-sse2.asm b/media/libjpeg/simd/jdcolor-sse2.asm
deleted file mode 100644
index 7ff5d05d0c..0000000000
--- a/media/libjpeg/simd/jdcolor-sse2.asm
+++ /dev/null
@@ -1,119 +0,0 @@
-;
-; jdcolor.asm - colorspace conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_ycc_rgb_convert_sse2)
-
-EXTN(jconst_ycc_rgb_convert_sse2):
-
-PW_F0402        times 8 dw  F_0_402
-PW_MF0228       times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgb_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extrgbx_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgr_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extbgrx_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxbgr_convert_sse2
-%include "jdcolext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_ycc_rgb_convert_sse2 jsimd_ycc_extxrgb_convert_sse2
-%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/jdmerge-altivec.c b/media/libjpeg/simd/jdmerge-altivec.c
deleted file mode 100644
index 6a35f2019c..0000000000
--- a/media/libjpeg/simd/jdmerge-altivec.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_344 22554              /* FIX(0.34414) */
-#define F_0_714 46802              /* FIX(0.71414) */
-#define F_1_402 91881              /* FIX(1.40200) */
-#define F_1_772 116130             /* FIX(1.77200) */
-#define F_0_402 (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
-#define F_0_285 (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
-#define F_0_228 (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
-
-#define SCALEBITS 16
-#define ONE_HALF (1 << (SCALEBITS - 1))
-
-#define RGB_INDEX0 {0,1,8,2,3,10,4,5,12,6,7,14,16,17,24,18}
-#define RGB_INDEX1 {3,10,4,5,12,6,7,14,16,17,24,18,19,26,20,21}
-#define RGB_INDEX2 {12,6,7,14,16,17,24,18,19,26,20,21,28,22,23,30}
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-
-#define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-#define RGB_INDEX {0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extrgbx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extrgbx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-#define RGB_INDEX0 {8,1,0,10,3,2,12,5,4,14,7,6,24,17,16,26}
-#define RGB_INDEX1 {3,2,12,5,4,14,7,6,24,17,16,26,19,18,28,21}
-#define RGB_INDEX2 {4,14,7,6,24,17,16,26,19,18,28,21,20,30,23,22}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX0
-#undef RGB_INDEX1
-#undef RGB_INDEX2
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-#define RGB_INDEX {8,1,0,9,10,3,2,11,12,5,4,13,14,7,6,15}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extbgrx_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extbgrx_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-#define RGB_INDEX {9,8,1,0,11,10,3,2,13,12,5,4,15,14,7,6}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxbgr_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxbgr_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
-
-#define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-#define RGB_INDEX {9,0,1,8,11,2,3,10,13,4,5,12,15,6,7,14}
-#define jsimd_h2v1_merged_upsample_altivec jsimd_h2v1_extxrgb_merged_upsample_altivec
-#define jsimd_h2v2_merged_upsample_altivec jsimd_h2v2_extxrgb_merged_upsample_altivec
-#include "jdmrgext-altivec.c"
-#undef RGB_PIXELSIZE
-#undef RGB_INDEX
-#undef jsimd_h2v1_merged_upsample_altivec
-#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/media/libjpeg/simd/jdmerge-mmx.asm b/media/libjpeg/simd/jdmerge-mmx.asm
deleted file mode 100644
index ee58bff1c6..0000000000
--- a/media/libjpeg/simd/jdmerge-mmx.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_merged_upsample_mmx)
-
-EXTN(jconst_merged_upsample_mmx):
-
-PW_F0402        times 4 dw  F_0_402
-PW_MF0228       times 4 dw -F_0_228
-PW_MF0344_F0285 times 2 dw -F_0_344, F_0_285
-PW_ONE          times 4 dw  1
-PD_ONEHALF      times 2 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
-%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
-%include "jdmrgext-mmx.asm"
diff --git a/media/libjpeg/simd/jdmerge-sse2-64.asm b/media/libjpeg/simd/jdmerge-sse2-64.asm
deleted file mode 100644
index 244bd40234..0000000000
--- a/media/libjpeg/simd/jdmerge-sse2-64.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_merged_upsample_sse2)
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402        times 8 dw  F_0_402
-PW_MF0228       times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgext-sse2-64.asm"
diff --git a/media/libjpeg/simd/jdmerge-sse2.asm b/media/libjpeg/simd/jdmerge-sse2.asm
deleted file mode 100644
index 236de5a385..0000000000
--- a/media/libjpeg/simd/jdmerge-sse2.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; jdmerge.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-
-%define SCALEBITS       16
-
-F_0_344 equ      22554                  ; FIX(0.34414)
-F_0_714 equ      46802                  ; FIX(0.71414)
-F_1_402 equ      91881                  ; FIX(1.40200)
-F_1_772 equ     116130                  ; FIX(1.77200)
-F_0_402 equ     (F_1_402 - 65536)       ; FIX(1.40200) - FIX(1)
-F_0_285 equ     ( 65536 - F_0_714)      ; FIX(1) - FIX(0.71414)
-F_0_228 equ     (131072 - F_1_772)      ; FIX(2) - FIX(1.77200)
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_merged_upsample_sse2)
-
-EXTN(jconst_merged_upsample_sse2):
-
-PW_F0402        times 8 dw  F_0_402
-PW_MF0228       times 8 dw -F_0_228
-PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
-PW_ONE          times 8 dw  1
-PD_ONEHALF      times 4 dd  1 << (SCALEBITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGB_RED
-%define RGB_GREEN EXT_RGB_GREEN
-%define RGB_BLUE EXT_RGB_BLUE
-%define RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_RGBX_RED
-%define RGB_GREEN EXT_RGBX_GREEN
-%define RGB_BLUE EXT_RGBX_BLUE
-%define RGB_PIXELSIZE EXT_RGBX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGR_RED
-%define RGB_GREEN EXT_BGR_GREEN
-%define RGB_BLUE EXT_BGR_BLUE
-%define RGB_PIXELSIZE EXT_BGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_BGRX_RED
-%define RGB_GREEN EXT_BGRX_GREEN
-%define RGB_BLUE EXT_BGRX_BLUE
-%define RGB_PIXELSIZE EXT_BGRX_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XBGR_RED
-%define RGB_GREEN EXT_XBGR_GREEN
-%define RGB_BLUE EXT_XBGR_BLUE
-%define RGB_PIXELSIZE EXT_XBGR_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
-
-%undef RGB_RED
-%undef RGB_GREEN
-%undef RGB_BLUE
-%undef RGB_PIXELSIZE
-%define RGB_RED EXT_XRGB_RED
-%define RGB_GREEN EXT_XRGB_GREEN
-%define RGB_BLUE EXT_XRGB_BLUE
-%define RGB_PIXELSIZE EXT_XRGB_PIXELSIZE
-%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
-%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
-%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/jdmrgext-mmx.asm b/media/libjpeg/simd/jdmrgext-mmx.asm
deleted file mode 100644
index 63f45cf373..0000000000
--- a/media/libjpeg/simd/jdmrgext-mmx.asm
+++ /dev/null
@@ -1,463 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          3
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-EXTN(jsimd_h2v1_merged_upsample_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [output_width(eax)]     ; col
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
-        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
-        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
-        mov     edi, JSAMPROW [edi]                             ; outptr
-
-        pop     ecx                     ; col
-
-        alignx  16,7
-.columnloop:
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        movq      mm6, MMWORD [ebx]     ; mm6=Cb(01234567)
-        movq      mm7, MMWORD [edx]     ; mm7=Cr(01234567)
-
-        pxor      mm1,mm1               ; mm1=(all 0's)
-        pcmpeqw   mm3,mm3
-        psllw     mm3,7                 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
-
-        movq      mm4,mm6
-        punpckhbw mm6,mm1               ; mm6=Cb(4567)=CbH
-        punpcklbw mm4,mm1               ; mm4=Cb(0123)=CbL
-        movq      mm0,mm7
-        punpckhbw mm7,mm1               ; mm7=Cr(4567)=CrH
-        punpcklbw mm0,mm1               ; mm0=Cr(0123)=CrL
-
-        paddw     mm6,mm3
-        paddw     mm4,mm3
-        paddw     mm7,mm3
-        paddw     mm0,mm3
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movq    mm5,mm6                 ; mm5=CbH
-        movq    mm2,mm4                 ; mm2=CbL
-        paddw   mm6,mm6                 ; mm6=2*CbH
-        paddw   mm4,mm4                 ; mm4=2*CbL
-        movq    mm1,mm7                 ; mm1=CrH
-        movq    mm3,mm0                 ; mm3=CrL
-        paddw   mm7,mm7                 ; mm7=2*CrH
-        paddw   mm0,mm0                 ; mm0=2*CrL
-
-        pmulhw  mm6,[GOTOFF(eax,PW_MF0228)]     ; mm6=(2*CbH * -FIX(0.22800))
-        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbL * -FIX(0.22800))
-        pmulhw  mm7,[GOTOFF(eax,PW_F0402)]      ; mm7=(2*CrH * FIX(0.40200))
-        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrL * FIX(0.40200))
-
-        paddw   mm6,[GOTOFF(eax,PW_ONE)]
-        paddw   mm4,[GOTOFF(eax,PW_ONE)]
-        psraw   mm6,1                   ; mm6=(CbH * -FIX(0.22800))
-        psraw   mm4,1                   ; mm4=(CbL * -FIX(0.22800))
-        paddw   mm7,[GOTOFF(eax,PW_ONE)]
-        paddw   mm0,[GOTOFF(eax,PW_ONE)]
-        psraw   mm7,1                   ; mm7=(CrH * FIX(0.40200))
-        psraw   mm0,1                   ; mm0=(CrL * FIX(0.40200))
-
-        paddw   mm6,mm5
-        paddw   mm4,mm2
-        paddw   mm6,mm5                 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
-        paddw   mm4,mm2                 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
-        paddw   mm7,mm1                 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
-        paddw   mm0,mm3                 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
-
-        movq    MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
-        movq    MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
-
-        movq      mm6,mm5
-        movq      mm7,mm2
-        punpcklwd mm5,mm1
-        punpckhwd mm6,mm1
-        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd mm2,mm3
-        punpckhwd mm7,mm3
-        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm5,SCALEBITS
-        psrad     mm6,SCALEBITS
-        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     mm2,SCALEBITS
-        psrad     mm7,SCALEBITS
-
-        packssdw  mm5,mm6       ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-        packssdw  mm2,mm7       ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-        psubw     mm5,mm1       ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-        psubw     mm2,mm3       ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-        movq    MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
-
-        mov     al,2                    ; Yctr
-        jmp     short .Yloop_1st
-        alignx  16,7
-
-.Yloop_2nd:
-        movq    mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
-        movq    mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
-        movq    mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
-        alignx  16,7
-
-.Yloop_1st:
-        movq    mm7, MMWORD [esi]       ; mm7=Y(01234567)
-
-        pcmpeqw mm6,mm6
-        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}
-        pand    mm6,mm7                 ; mm6=Y(0246)=YE
-        psrlw   mm7,BYTE_BIT            ; mm7=Y(1357)=YO
-
-        movq    mm1,mm0                 ; mm1=mm0=(R-Y)(L/H)
-        movq    mm3,mm2                 ; mm3=mm2=(G-Y)(L/H)
-        movq    mm5,mm4                 ; mm5=mm4=(B-Y)(L/H)
-
-        paddw     mm0,mm6               ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
-        paddw     mm1,mm7               ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
-        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
-        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-        paddw     mm2,mm6               ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
-        paddw     mm3,mm7               ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
-        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
-        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-        paddw     mm4,mm6               ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
-        paddw     mm5,mm7               ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
-        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
-        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
-        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
-
-        movq      mmG,mmA
-        movq      mmH,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
-        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
-
-        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
-        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
-
-        movq      mmC,mmD
-        movq      mmB,mmD
-        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
-        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
-
-        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
-
-        movq      mmF,mmE
-        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
-        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
-
-        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
-        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
-        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      near .endcolumn
-
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_MMWORD
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-        movq    mmA,mmC
-        sub     ecx, byte 2*SIZEOF_MMWORD
-        add     edi, byte 2*SIZEOF_MMWORD
-        jmp     short .column_st4
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmE
-        sub     ecx, byte SIZEOF_MMWORD
-        add     edi, byte SIZEOF_MMWORD
-.column_st4:
-        movd    eax,mmA
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st2
-        mov     DWORD [edi+0*SIZEOF_DWORD], eax
-        psrlq   mmA,DWORD_BIT
-        movd    eax,mmA
-        sub     ecx, byte SIZEOF_DWORD
-        add     edi, byte SIZEOF_DWORD
-.column_st2:
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi+0*SIZEOF_WORD], ax
-        shr     eax,WORD_BIT
-        sub     ecx, byte SIZEOF_WORD
-        add     edi, byte SIZEOF_WORD
-.column_st1:
-        cmp     ecx, byte SIZEOF_BYTE
-        jb      short .endcolumn
-        mov     BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
-        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
-        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
-
-        movq      mmC,mmA
-        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
-        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
-        movq      mmG,mmB
-        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
-        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
-
-        movq      mmD,mmA
-        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
-        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
-        movq      mmH,mmC
-        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
-        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
-
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st16
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-        sub     ecx, byte SIZEOF_MMWORD
-        jz      short .endcolumn
-
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-        add     esi, byte SIZEOF_MMWORD                 ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-        add     edx, byte SIZEOF_MMWORD                 ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st16:
-        cmp     ecx, byte SIZEOF_MMWORD/2
-        jb      short .column_st8
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-        movq    mmA,mmC
-        movq    mmD,mmH
-        sub     ecx, byte SIZEOF_MMWORD/2
-        add     edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-        cmp     ecx, byte SIZEOF_MMWORD/4
-        jb      short .column_st4
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-        movq    mmA,mmD
-        sub     ecx, byte SIZEOF_MMWORD/4
-        add     edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-        cmp     ecx, byte SIZEOF_MMWORD/8
-        jb      short .endcolumn
-        movd    DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-        align   16
-        global  EXTN(jsimd_h2v2_merged_upsample_mmx)
-
-EXTN(jsimd_h2v2_merged_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     eax, JDIMENSION [output_width(ebp)]
-
-        mov     edi, JSAMPIMAGE [input_buf(ebp)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(ebp)]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-        push    edx                     ; inptr2
-        push    ebx                     ; inptr1
-        push    esi                     ; inptr00
-        mov     ebx,esp
-
-        push    edi                     ; output_buf (outptr0)
-        push    ecx                     ; in_row_group_ctr
-        push    ebx                     ; input_buf
-        push    eax                     ; output_width
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
-        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
-        mov     POINTER [ebx+0*SIZEOF_POINTER], esi
-        mov     POINTER [ebx-1*SIZEOF_POINTER], edi
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-        add     esp, byte 7*SIZEOF_DWORD
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdmrgext-sse2-64.asm b/media/libjpeg/simd/jdmrgext-sse2-64.asm
deleted file mode 100644
index ad74c5ff4d..0000000000
--- a/media/libjpeg/simd/jdmrgext-sse2-64.asm
+++ /dev/null
@@ -1,537 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          3
-
-        align   16
-        global  EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     ecx, r10d        ; col
-        test    rcx,rcx
-        jz      near .return
-
-        push    rcx
-
-        mov     rdi, r11
-        mov     ecx, r12d
-        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-        mov     rdi, r13
-        mov     rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]         ; inptr0
-        mov     rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]         ; inptr1
-        mov     rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]         ; inptr2
-        mov     rdi, JSAMPROW [rdi]                             ; outptr
-
-        pop     rcx                     ; col
-
-.columnloop:
-
-        movdqa    xmm6, XMMWORD [rbx]   ; xmm6=Cb(0123456789ABCDEF)
-        movdqa    xmm7, XMMWORD [rdx]   ; xmm7=Cr(0123456789ABCDEF)
-
-        pxor      xmm1,xmm1             ; xmm1=(all 0's)
-        pcmpeqw   xmm3,xmm3
-        psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        movdqa    xmm4,xmm6
-        punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
-        punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
-        movdqa    xmm0,xmm7
-        punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
-        punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
-
-        paddw     xmm6,xmm3
-        paddw     xmm4,xmm3
-        paddw     xmm7,xmm3
-        paddw     xmm0,xmm3
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm5,xmm6               ; xmm5=CbH
-        movdqa  xmm2,xmm4               ; xmm2=CbL
-        paddw   xmm6,xmm6               ; xmm6=2*CbH
-        paddw   xmm4,xmm4               ; xmm4=2*CbL
-        movdqa  xmm1,xmm7               ; xmm1=CrH
-        movdqa  xmm3,xmm0               ; xmm3=CrL
-        paddw   xmm7,xmm7               ; xmm7=2*CrH
-        paddw   xmm0,xmm0               ; xmm0=2*CrL
-
-        pmulhw  xmm6,[rel PW_MF0228]    ; xmm6=(2*CbH * -FIX(0.22800))
-        pmulhw  xmm4,[rel PW_MF0228]    ; xmm4=(2*CbL * -FIX(0.22800))
-        pmulhw  xmm7,[rel PW_F0402]     ; xmm7=(2*CrH * FIX(0.40200))
-        pmulhw  xmm0,[rel PW_F0402]     ; xmm0=(2*CrL * FIX(0.40200))
-
-        paddw   xmm6,[rel PW_ONE]
-        paddw   xmm4,[rel PW_ONE]
-        psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
-        psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
-        paddw   xmm7,[rel PW_ONE]
-        paddw   xmm0,[rel PW_ONE]
-        psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
-        psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
-
-        paddw   xmm6,xmm5
-        paddw   xmm4,xmm2
-        paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-        paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-        paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-        paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
-
-        movdqa    xmm6,xmm5
-        movdqa    xmm7,xmm2
-        punpcklwd xmm5,xmm1
-        punpckhwd xmm6,xmm1
-        pmaddwd   xmm5,[rel PW_MF0344_F0285]
-        pmaddwd   xmm6,[rel PW_MF0344_F0285]
-        punpcklwd xmm2,xmm3
-        punpckhwd xmm7,xmm3
-        pmaddwd   xmm2,[rel PW_MF0344_F0285]
-        pmaddwd   xmm7,[rel PW_MF0344_F0285]
-
-        paddd     xmm5,[rel PD_ONEHALF]
-        paddd     xmm6,[rel PD_ONEHALF]
-        psrad     xmm5,SCALEBITS
-        psrad     xmm6,SCALEBITS
-        paddd     xmm2,[rel PD_ONEHALF]
-        paddd     xmm7,[rel PD_ONEHALF]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm7,SCALEBITS
-
-        packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-        packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-        psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-        psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-        movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
-
-        mov     al,2                    ; Yctr
-        jmp     short .Yloop_1st
-
-.Yloop_2nd:
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
-
-.Yloop_1st:
-        movdqa  xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
-
-        pcmpeqw xmm6,xmm6
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-        pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
-        psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
-
-        movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
-        movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
-        movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
-
-        paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-        paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     rcx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     rcx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_MMWORD
-        sub     rcx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_DWORD
-        sub     rcx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of rax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     rcx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [rdi], ax
-        add     rdi, byte SIZEOF_WORD
-        sub     rcx, byte SIZEOF_WORD
-        shr     rax, 16
-.column_st1:
-        ; Store the lower 1 byte of rax to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .endcolumn
-        mov     BYTE [rdi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     rcx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    rdi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     rcx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     rsi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     rbx, byte SIZEOF_XMMWORD        ; inptr1
-        add     rdx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-
-.column_st32:
-        cmp     rcx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     rcx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     rcx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
-        add     rdi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     rcx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     rcx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    XMM_MMWORD [rdi], xmmA
-        add     rdi, byte SIZEOF_XMMWORD/8*4
-        sub     rcx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    rcx, rcx
-        jz      short .endcolumn
-        movd    XMM_DWORD [rdi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-        sfence          ; flush the write buffer
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-; r10 = JDIMENSION output_width
-; r11 = JSAMPIMAGE input_buf
-; r12 = JDIMENSION in_row_group_ctr
-; r13 = JSAMPARRAY output_buf
-
-        align   16
-        global  EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        mov     eax, r10d
-
-        mov     rdi, r11
-        mov     ecx, r12d
-        mov     rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-        mov     rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-        mov     rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
-        mov     rdi, r13
-        lea     rsi, [rsi+rcx*SIZEOF_JSAMPROW]
-
-        push    rdx                     ; inptr2
-        push    rbx                     ; inptr1
-        push    rsi                     ; inptr00
-        mov     rbx,rsp
-
-        push    rdi
-        push    rcx
-        push    rax
-
-        %ifdef WIN64
-        mov r8, rcx
-        mov r9, rdi
-        mov rcx, rax
-        mov rdx, rbx
-        %else
-        mov rdx, rcx
-        mov rcx, rdi
-        mov     rdi, rax
-        mov rsi, rbx
-        %endif
-
-        call    EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        pop rax
-        pop rcx
-        pop rdi
-        pop rsi
-        pop rbx
-        pop rdx
-
-        add     rdi, byte SIZEOF_JSAMPROW       ; outptr1
-        add     rsi, byte SIZEOF_JSAMPROW       ; inptr01
-
-        push    rdx                     ; inptr2
-        push    rbx                     ; inptr1
-        push    rsi                     ; inptr00
-        mov     rbx,rsp
-
-        push    rdi
-        push    rcx
-        push    rax
-
-        %ifdef WIN64
-        mov r8, rcx
-        mov r9, rdi
-        mov rcx, rax
-        mov rdx, rbx
-        %else
-        mov rdx, rcx
-        mov rcx, rdi
-        mov     rdi, rax
-        mov rsi, rbx
-        %endif
-
-        call    EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        pop rax
-        pop rcx
-        pop rdi
-        pop rsi
-        pop rbx
-        pop rdx
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdmrgext-sse2.asm b/media/libjpeg/simd/jdmrgext-sse2.asm
deleted file mode 100644
index b50f698b49..0000000000
--- a/media/libjpeg/simd/jdmrgext-sse2.asm
+++ /dev/null
@@ -1,518 +0,0 @@
-;
-; jdmrgext.asm - merged upsampling/color conversion (SSE2)
-;
-; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2012, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jcolsamp.inc"
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          3
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     ecx, JDIMENSION [output_width(eax)]     ; col
-        test    ecx,ecx
-        jz      near .return
-
-        push    ecx
-
-        mov     edi, JSAMPIMAGE [input_buf(eax)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(eax)]
-        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
-        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
-        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
-        mov     edi, JSAMPROW [edi]                             ; outptr
-
-        pop     ecx                     ; col
-
-        alignx  16,7
-.columnloop:
-        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-        movdqa    xmm6, XMMWORD [ebx]   ; xmm6=Cb(0123456789ABCDEF)
-        movdqa    xmm7, XMMWORD [edx]   ; xmm7=Cr(0123456789ABCDEF)
-
-        pxor      xmm1,xmm1             ; xmm1=(all 0's)
-        pcmpeqw   xmm3,xmm3
-        psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        movdqa    xmm4,xmm6
-        punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
-        punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
-        movdqa    xmm0,xmm7
-        punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
-        punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
-
-        paddw     xmm6,xmm3
-        paddw     xmm4,xmm3
-        paddw     xmm7,xmm3
-        paddw     xmm0,xmm3
-
-        ; (Original)
-        ; R = Y                + 1.40200 * Cr
-        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-        ; B = Y + 1.77200 * Cb
-        ;
-        ; (This implementation)
-        ; R = Y                + 0.40200 * Cr + Cr
-        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-        ; B = Y - 0.22800 * Cb + Cb + Cb
-
-        movdqa  xmm5,xmm6               ; xmm5=CbH
-        movdqa  xmm2,xmm4               ; xmm2=CbL
-        paddw   xmm6,xmm6               ; xmm6=2*CbH
-        paddw   xmm4,xmm4               ; xmm4=2*CbL
-        movdqa  xmm1,xmm7               ; xmm1=CrH
-        movdqa  xmm3,xmm0               ; xmm3=CrL
-        paddw   xmm7,xmm7               ; xmm7=2*CrH
-        paddw   xmm0,xmm0               ; xmm0=2*CrL
-
-        pmulhw  xmm6,[GOTOFF(eax,PW_MF0228)]    ; xmm6=(2*CbH * -FIX(0.22800))
-        pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbL * -FIX(0.22800))
-        pmulhw  xmm7,[GOTOFF(eax,PW_F0402)]     ; xmm7=(2*CrH * FIX(0.40200))
-        pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrL * FIX(0.40200))
-
-        paddw   xmm6,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm4,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
-        psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
-        paddw   xmm7,[GOTOFF(eax,PW_ONE)]
-        paddw   xmm0,[GOTOFF(eax,PW_ONE)]
-        psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
-        psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
-
-        paddw   xmm6,xmm5
-        paddw   xmm4,xmm2
-        paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-        paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-        paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-        paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
-
-        movdqa    xmm6,xmm5
-        movdqa    xmm7,xmm2
-        punpcklwd xmm5,xmm1
-        punpckhwd xmm6,xmm1
-        pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
-        punpcklwd xmm2,xmm3
-        punpckhwd xmm7,xmm3
-        pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-        pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-        paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm5,SCALEBITS
-        psrad     xmm6,SCALEBITS
-        paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-        paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
-        psrad     xmm2,SCALEBITS
-        psrad     xmm7,SCALEBITS
-
-        packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-        packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-        psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-        psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-        movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
-
-        mov     al,2                    ; Yctr
-        jmp     short .Yloop_1st
-        alignx  16,7
-
-.Yloop_2nd:
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
-        alignx  16,7
-
-.Yloop_1st:
-        movdqa  xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
-
-        pcmpeqw xmm6,xmm6
-        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-        pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
-        psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
-
-        movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
-        movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
-        movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
-
-        paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-        paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-        packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-        packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-        paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-        paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-        packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-        packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-        paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-        paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-        packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-        packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-        punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-        movdqa    xmmG,xmmA
-        movdqa    xmmH,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-        punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-        psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-        psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-        movdqa    xmmC,xmmD
-        movdqa    xmmB,xmmD
-        punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-        punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-        psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-        movdqa    xmmF,xmmE
-        punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-        punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-        pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-        movdqa    xmmB,xmmE
-        punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-        punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-        punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-        pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-        movdqa    xmmB,xmmF
-        punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-        punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-        punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-        punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-        punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-        punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
-        cmp     ecx, byte 2*SIZEOF_XMMWORD
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmF
-        sub     ecx, byte 2*SIZEOF_XMMWORD
-        jmp     short .column_st15
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD
-.column_st15:
-        ; Store the lower 8 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_MMWORD
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_MMWORD
-        sub     ecx, byte SIZEOF_MMWORD
-        psrldq  xmmA, SIZEOF_MMWORD
-.column_st7:
-        ; Store the lower 4 bytes of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_DWORD
-        jb      short .column_st3
-        movd    XMM_DWORD [edi], xmmA
-        add     edi, byte SIZEOF_DWORD
-        sub     ecx, byte SIZEOF_DWORD
-        psrldq  xmmA, SIZEOF_DWORD
-.column_st3:
-        ; Store the lower 2 bytes of eax to the output when it has enough
-        ; space.
-        movd    eax, xmmA
-        cmp     ecx, byte SIZEOF_WORD
-        jb      short .column_st1
-        mov     WORD [edi], ax
-        add     edi, byte SIZEOF_WORD
-        sub     ecx, byte SIZEOF_WORD
-        shr     eax, 16
-.column_st1:
-        ; Store the lower 1 byte of eax to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .endcolumn
-        mov     BYTE [edi], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-        pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-        pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-        pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-        ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-        ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-        ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-        ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-        punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-        punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-        punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-        punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-        movdqa    xmmC,xmmA
-        punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-        punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-        movdqa    xmmG,xmmB
-        punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-        punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-        movdqa    xmmD,xmmA
-        punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-        punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-        movdqa    xmmH,xmmC
-        punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-        punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-        cmp     ecx, byte SIZEOF_XMMWORD
-        jb      short .column_st32
-
-        test    edi, SIZEOF_XMMWORD-1
-        jnz     short .out1
-        ; --(aligned)-------------------
-        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-        jmp     short .out0
-.out1:  ; --(unaligned)-----------------
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-        movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-.out0:
-        add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-        sub     ecx, byte SIZEOF_XMMWORD
-        jz      near .endcolumn
-
-        add     esi, byte SIZEOF_XMMWORD        ; inptr0
-        dec     al                      ; Yctr
-        jnz     near .Yloop_2nd
-
-        add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-        add     edx, byte SIZEOF_XMMWORD        ; inptr2
-        jmp     near .columnloop
-        alignx  16,7
-
-.column_st32:
-        cmp     ecx, byte SIZEOF_XMMWORD/2
-        jb      short .column_st16
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        movdqa  xmmA,xmmC
-        movdqa  xmmD,xmmH
-        sub     ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-        cmp     ecx, byte SIZEOF_XMMWORD/4
-        jb      short .column_st15
-        movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-        add     edi, byte SIZEOF_XMMWORD        ; outptr
-        movdqa  xmmA,xmmD
-        sub     ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
-        ; space.
-        cmp     ecx, byte SIZEOF_XMMWORD/8
-        jb      short .column_st7
-        movq    XMM_MMWORD [edi], xmmA
-        add     edi, byte SIZEOF_XMMWORD/8*4
-        sub     ecx, byte SIZEOF_XMMWORD/8
-        psrldq  xmmA, SIZEOF_XMMWORD/8*4
-.column_st7:
-        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
-        ; space.
-        test    ecx, ecx
-        jz      short .endcolumn
-        movd    XMM_DWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-        sfence          ; flush the write buffer
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b) (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf
-
-        align   16
-        global  EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     eax, POINTER [output_width(ebp)]
-
-        mov     edi, JSAMPIMAGE [input_buf(ebp)]
-        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
-        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-        mov     edi, JSAMPARRAY [output_buf(ebp)]
-        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-        push    edx                     ; inptr2
-        push    ebx                     ; inptr1
-        push    esi                     ; inptr00
-        mov     ebx,esp
-
-        push    edi                     ; output_buf (outptr0)
-        push    ecx                     ; in_row_group_ctr
-        push    ebx                     ; input_buf
-        push    eax                     ; output_width
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
-        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
-        mov     POINTER [ebx+0*SIZEOF_POINTER], esi
-        mov     POINTER [ebx-1*SIZEOF_POINTER], edi
-
-        call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-        add     esp, byte 7*SIZEOF_DWORD
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdsample-mmx.asm b/media/libjpeg/simd/jdsample-mmx.asm
deleted file mode 100644
index 5e4fa7ae22..0000000000
--- a/media/libjpeg/simd/jdsample-mmx.asm
+++ /dev/null
@@ -1,736 +0,0 @@
-;
-; jdsample.asm - upsampling (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fancy_upsample_mmx)
-
-EXTN(jconst_fancy_upsample_mmx):
-
-PW_ONE          times 4 dw  1
-PW_TWO          times 4 dw  2
-PW_THREE        times 4 dw  3
-PW_SEVEN        times 4 dw  7
-PW_EIGHT        times 4 dw  8
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v1_fancy_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                     ; colctr
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr
-
-        test    eax, SIZEOF_MMWORD-1
-        jz      short .skip
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-.skip:
-        pxor    mm0,mm0                 ; mm0=(all 0's)
-        pcmpeqb mm7,mm7
-        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
-        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]
-
-        add     eax, byte SIZEOF_MMWORD-1
-        and     eax, byte -SIZEOF_MMWORD
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        pcmpeqb mm6,mm6
-        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
-        jmp     short .upsample
-        alignx  16,7
-
-.columnloop:
-        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
-        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
-
-.upsample:
-        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
-        movq    mm2,mm1
-        movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
-        psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
-        psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)
-
-        por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
-        por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)
-
-        movq    mm7,mm1
-        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
-
-        movq      mm4,mm1
-        punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
-        punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
-        movq      mm5,mm2
-        punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
-        punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
-        movq      mm6,mm3
-        punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
-        punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)
-
-        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   mm2,[GOTOFF(ebx,PW_ONE)]
-        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
-        paddw   mm3,[GOTOFF(ebx,PW_TWO)]
-        paddw   mm6,[GOTOFF(ebx,PW_TWO)]
-
-        paddw   mm2,mm1
-        paddw   mm5,mm4
-        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
-        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
-        paddw   mm3,mm1
-        paddw   mm6,mm4
-        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
-        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)
-
-        psllw   mm3,BYTE_BIT
-        psllw   mm6,BYTE_BIT
-        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
-        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5
-
-        sub     eax, byte SIZEOF_MMWORD
-        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
-        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
-;                                JDIMENSION downsampled_width,
-;                                JSAMPARRAY input_data,
-;                                JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          4
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)
-
-EXTN(jsimd_h2v2_fancy_upsample_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     edx,eax                         ; edx = original ebp
-        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(edx)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                                     ; colctr
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-
-        test    eax, SIZEOF_MMWORD-1
-        jz      short .skip
-        push    edx
-        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-        pop     edx
-.skip:
-        ; -- process the first column block
-
-        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
-        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
-        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      mm3,mm3               ; mm3=(all 0's)
-        movq      mm4,mm0
-        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
-        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
-        movq      mm5,mm1
-        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
-        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
-        movq      mm6,mm2
-        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
-        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)
-
-        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-
-        pcmpeqb mm7,mm7
-        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
-
-        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
-        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
-        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
-        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
-
-        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
-        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6
-
-        pand    mm1,mm7                 ; mm1=( 0 - - -)
-        pand    mm2,mm7                 ; mm2=( 0 - - -)
-
-        movq    MMWORD [wk(0)], mm1
-        movq    MMWORD [wk(1)], mm2
-
-        poppic  ebx
-
-        add     eax, byte SIZEOF_MMWORD-1
-        and     eax, byte -SIZEOF_MMWORD
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        ; -- process the last column block
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pcmpeqb mm1,mm1
-        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
-        movq    mm2,mm1
-
-        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
-        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)
-
-        movq    MMWORD [wk(2)], mm1
-        movq    MMWORD [wk(3)], mm2
-
-        jmp     short .upsample
-        alignx  16,7
-
-.columnloop:
-        ; -- process the next column block
-
-        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
-        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
-        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      mm3,mm3               ; mm3=(all 0's)
-        movq      mm4,mm0
-        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
-        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
-        movq      mm5,mm1
-        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
-        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
-        movq      mm6,mm2
-        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
-        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)
-
-        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-
-        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
-        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
-        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
-        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
-
-        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
-        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6
-
-        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
-        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
-
-        movq    MMWORD [wk(2)], mm1
-        movq    MMWORD [wk(3)], mm2
-
-.upsample:
-        ; -- process the upper row
-
-        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
-        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)
-
-        movq    mm0,mm7
-        movq    mm4,mm3
-        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
-        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
-        movq    mm5,mm7
-        movq    mm6,mm3
-        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
-        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)
-
-        por     mm0,mm4                         ; mm0=( 1 2 3 4)
-        por     mm5,mm6                         ; mm5=( 3 4 5 6)
-
-        movq    mm1,mm7
-        movq    mm2,mm3
-        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
-        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
-        movq    mm4,mm3
-        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
-
-        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
-        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)
-
-        movq    MMWORD [wk(0)], mm4
-
-        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
-        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   mm1,mm7
-        paddw   mm5,mm3
-        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
-        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
-        paddw   mm0,mm7
-        paddw   mm2,mm3
-        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
-        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)
-
-        psllw   mm0,BYTE_BIT
-        psllw   mm2,BYTE_BIT
-        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
-        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
-
-        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
-        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5
-
-        ; -- process the lower row
-
-        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
-        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)
-
-        movq    mm7,mm6
-        movq    mm3,mm4
-        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
-        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
-        movq    mm0,mm6
-        movq    mm2,mm4
-        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
-        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)
-
-        por     mm7,mm3                         ; mm7=( 1 2 3 4)
-        por     mm0,mm2                         ; mm0=( 3 4 5 6)
-
-        movq    mm1,mm6
-        movq    mm5,mm4
-        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
-        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
-        movq    mm3,mm4
-        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
-
-        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
-        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)
-
-        movq    MMWORD [wk(1)], mm3
-
-        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
-        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   mm1,mm6
-        paddw   mm0,mm4
-        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
-        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
-        paddw   mm7,mm6
-        paddw   mm5,mm4
-        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
-        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)
-
-        psllw   mm7,BYTE_BIT
-        psllw   mm5,BYTE_BIT
-        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
-        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0
-
-        poppic  ebx
-
-        sub     eax, byte SIZEOF_MMWORD
-        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
-        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
-        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
-        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
-        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
-        cmp     eax, byte SIZEOF_MMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     ecx
-        pop     eax
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_upsample_mmx)
-
-EXTN(jsimd_h2v1_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_MMWORD)-1
-        and     edx, byte -(2*SIZEOF_MMWORD)
-        jz      short .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      short .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-        mov     eax,edx                         ; colctr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-        movq      mm1,mm0
-        punpcklbw mm0,mm0
-        punpckhbw mm1,mm1
-
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-        movq      mm3,mm2
-        punpcklbw mm2,mm2
-        punpckhbw mm3,mm3
-
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
-        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      short .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
-;                          JDIMENSION output_width,
-;                          JSAMPARRAY input_data,
-;                          JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_upsample_mmx)
-
-EXTN(jsimd_h2v2_upsample_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_MMWORD)-1
-        and     edx, byte -(2*SIZEOF_MMWORD)
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      short .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]                     ; inptr
-        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-        mov     eax,edx                                 ; colctr
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
-
-        movq      mm1,mm0
-        punpcklbw mm0,mm0
-        punpckhbw mm1,mm1
-
-        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
-        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
-        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
-        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
-
-        movq      mm3,mm2
-        punpcklbw mm2,mm2
-        punpckhbw mm3,mm3
-
-        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
-        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
-        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
-
-        sub     eax, byte 2*SIZEOF_MMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
-        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
-        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      short .rowloop
-
-        emms            ; empty MMX state
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdsample-sse2-64.asm b/media/libjpeg/simd/jdsample-sse2-64.asm
deleted file mode 100644
index 1faaed648a..0000000000
--- a/media/libjpeg/simd/jdsample-sse2-64.asm
+++ /dev/null
@@ -1,670 +0,0 @@
-;
-; jdsample.asm - upsampling (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE          times 8 dw  1
-PW_TWO          times 8 dw  2
-PW_THREE        times 8 dw  3
-PW_SEVEN        times 8 dw  7
-PW_EIGHT        times 8 dw  8
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov     eax, r11d  ; colctr
-        test    rax,rax
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      near .return
-
-        mov     rsi, r12        ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rax                     ; colctr
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]     ; inptr
-        mov     rdi, JSAMPROW [rdi]     ; outptr
-
-        test    rax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-.skip:
-        pxor    xmm0,xmm0               ; xmm0=(all 0's)
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-1)
-        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-        add     rax, byte SIZEOF_XMMWORD-1
-        and     rax, byte -SIZEOF_XMMWORD
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-
-.columnloop_last:
-        pcmpeqb xmm6,xmm6
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        jmp     short .upsample
-
-.columnloop:
-        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
-        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2,xmm1
-        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
-        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
-        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
-
-        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
-        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
-
-        movdqa  xmm7,xmm1
-        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
-        movdqa    xmm4,xmm1
-        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm2
-        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
-        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
-        movdqa    xmm6,xmm3
-        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
-        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
-
-        pmullw  xmm1,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-        paddw   xmm2,[rel PW_ONE]
-        paddw   xmm5,[rel PW_ONE]
-        paddw   xmm3,[rel PW_TWO]
-        paddw   xmm6,[rel PW_TWO]
-
-        paddw   xmm2,xmm1
-        paddw   xmm5,xmm4
-        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-        paddw   xmm3,xmm1
-        paddw   xmm6,xmm4
-        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm3,BYTE_BIT
-        psllw   xmm6,BYTE_BIT
-        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
-
-        sub     rax, byte SIZEOF_XMMWORD
-        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     rsi
-        pop     rdi
-        pop     rax
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     rcx                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION downsampled_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          4
-
-        align   16
-        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-        push    rbx
-
-        mov     eax, r11d  ; colctr
-        test    rax,rax
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      near .return
-
-        mov     rsi, r12        ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rax                                     ; colctr
-        push    rcx
-        push    rdi
-        push    rsi
-
-        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
-        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
-
-        test    rax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        push    rdx
-        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-        pop     rdx
-.skip:
-        ; -- process the first column block
-
-        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
-        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
-        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-2)
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
-
-        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
-        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
-
-        movdqa  XMMWORD [wk(0)], xmm1
-        movdqa  XMMWORD [wk(1)], xmm2
-
-        add     rax, byte SIZEOF_XMMWORD-1
-        and     rax, byte -SIZEOF_XMMWORD
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-
-.columnloop_last:
-        ; -- process the last column block
-
-        pcmpeqb xmm1,xmm1
-        pslldq  xmm1,(SIZEOF_XMMWORD-2)
-        movdqa  xmm2,xmm1
-
-        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
-        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
-
-        jmp     near .upsample
-
-.columnloop:
-        ; -- process the next column block
-
-        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
-        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
-        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
-
-        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
-        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
-
-        movdqa  XMMWORD [wk(2)], xmm1
-        movdqa  XMMWORD [wk(3)], xmm2
-
-.upsample:
-        ; -- process the upper row
-
-        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
-        movdqa  xmm5,xmm7
-        movdqa  xmm6,xmm3
-        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
-
-        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
-        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm2,xmm3
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm4,xmm3
-        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(0)], xmm4
-
-        pmullw  xmm7,[rel PW_THREE]
-        pmullw  xmm3,[rel PW_THREE]
-        paddw   xmm1,[rel PW_EIGHT]
-        paddw   xmm5,[rel PW_EIGHT]
-        paddw   xmm0,[rel PW_SEVEN]
-        paddw   xmm2,[rel PW_SEVEN]
-
-        paddw   xmm1,xmm7
-        paddw   xmm5,xmm3
-        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm3
-        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm0,BYTE_BIT
-        psllw   xmm2,BYTE_BIT
-        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
-
-        ; -- process the lower row
-
-        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
-        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
-        movdqa  xmm0,xmm6
-        movdqa  xmm2,xmm4
-        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
-
-        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
-        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm4
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm3,xmm4
-        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(1)], xmm3
-
-        pmullw  xmm6,[rel PW_THREE]
-        pmullw  xmm4,[rel PW_THREE]
-        paddw   xmm1,[rel PW_EIGHT]
-        paddw   xmm0,[rel PW_EIGHT]
-        paddw   xmm7,[rel PW_SEVEN]
-        paddw   xmm5,[rel PW_SEVEN]
-
-        paddw   xmm1,xmm6
-        paddw   xmm0,xmm4
-        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm7,xmm6
-        paddw   xmm5,xmm4
-        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm7,BYTE_BIT
-        psllw   xmm5,BYTE_BIT
-        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
-
-        sub     rax, byte SIZEOF_XMMWORD
-        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
-        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
-        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
-        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
-        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
-        cmp     rax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    rax,rax
-        jnz     near .columnloop_last
-
-        pop     rsi
-        pop     rdi
-        pop     rcx
-        pop     rax
-
-        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     rcx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov     edx, r11d
-        add     rdx, byte (2*SIZEOF_XMMWORD)-1
-        and     rdx, byte -(2*SIZEOF_XMMWORD)
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      short .return
-
-        mov     rsi, r12 ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]             ; inptr
-        mov     rdi, JSAMPROW [rdi]             ; outptr
-        mov     rax,rdx                         ; colctr
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
-        jmp     short .columnloop
-
-.nextrow:
-        pop     rsi
-        pop     rdi
-
-        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
-        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     rcx                             ; rowctr
-        jg      short .rowloop
-
-.return:
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-; r10 = int max_v_samp_factor
-; r11 = JDIMENSION output_width
-; r12 = JSAMPARRAY input_data
-; r13 = JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        mov     edx, r11d
-        add     rdx, byte (2*SIZEOF_XMMWORD)-1
-        and     rdx, byte -(2*SIZEOF_XMMWORD)
-        jz      near .return
-
-        mov     rcx, r10        ; rowctr
-        test    rcx,rcx
-        jz      near .return
-
-        mov     rsi, r12        ; input_data
-        mov     rdi, r13
-        mov     rdi, JSAMPARRAY [rdi]                   ; output_data
-.rowloop:
-        push    rdi
-        push    rsi
-
-        mov     rsi, JSAMPROW [rsi]                     ; inptr
-        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
-        mov     rax,rdx                                 ; colctr
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
-        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     rax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
-        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
-        jmp     short .columnloop
-
-.nextrow:
-        pop     rsi
-        pop     rdi
-
-        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     rcx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jdsample-sse2.asm b/media/libjpeg/simd/jdsample-sse2.asm
deleted file mode 100644
index 1d0059e803..0000000000
--- a/media/libjpeg/simd/jdsample-sse2.asm
+++ /dev/null
@@ -1,728 +0,0 @@
-;
-; jdsample.asm - upsampling (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fancy_upsample_sse2)
-
-EXTN(jconst_fancy_upsample_sse2):
-
-PW_ONE          times 8 dw  1
-PW_TWO          times 8 dw  2
-PW_THREE        times 8 dw  3
-PW_SEVEN        times 8 dw  7
-PW_EIGHT        times 8 dw  8
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
-;
-; The upsampling algorithm is linear interpolation between pixel centers,
-; also known as a "triangle filter".  This is a good compromise between
-; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
-; of the way between input pixel centers.
-;
-; GLOBAL(void)
-; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v1_fancy_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                     ; colctr
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]     ; inptr
-        mov     edi, JSAMPROW [edi]     ; outptr
-
-        test    eax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-.skip:
-        pxor    xmm0,xmm0               ; xmm0=(all 0's)
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-1)
-        pand    xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-        add     eax, byte SIZEOF_XMMWORD-1
-        and     eax, byte -SIZEOF_XMMWORD
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        pcmpeqb xmm6,xmm6
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-        pand    xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        jmp     short .upsample
-        alignx  16,7
-
-.columnloop:
-        movdqa  xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
-        pslldq  xmm6,(SIZEOF_XMMWORD-1)
-
-.upsample:
-        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
-        movdqa  xmm2,xmm1
-        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
-        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
-        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
-
-        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
-        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
-
-        movdqa  xmm7,xmm1
-        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
-
-        movdqa    xmm4,xmm1
-        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm2
-        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
-        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
-        movdqa    xmm6,xmm3
-        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
-        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
-
-        pmullw  xmm1,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   xmm2,[GOTOFF(ebx,PW_ONE)]
-        paddw   xmm5,[GOTOFF(ebx,PW_ONE)]
-        paddw   xmm3,[GOTOFF(ebx,PW_TWO)]
-        paddw   xmm6,[GOTOFF(ebx,PW_TWO)]
-
-        paddw   xmm2,xmm1
-        paddw   xmm5,xmm4
-        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
-        paddw   xmm3,xmm1
-        paddw   xmm6,xmm4
-        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm3,BYTE_BIT
-        psllw   xmm6,BYTE_BIT
-        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
-
-        sub     eax, byte SIZEOF_XMMWORD
-        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     eax
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
-; Again a triangle filter; see comments for h2v1 case, above.
-;
-; GLOBAL(void)
-; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
-;                                 JDIMENSION downsampled_width,
-;                                 JSAMPARRAY input_data,
-;                                 JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          4
-%define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
-
-EXTN(jsimd_h2v2_fancy_upsample_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic eax             ; make a room for GOT address
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx                     ; get GOT address
-        movpic  POINTER [gotptr], ebx   ; save GOT address
-
-        mov     edx,eax                         ; edx = original ebp
-        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
-        test    eax,eax
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(edx)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    eax                                     ; colctr
-        push    ecx
-        push    edi
-        push    esi
-
-        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
-        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-
-        test    eax, SIZEOF_XMMWORD-1
-        jz      short .skip
-        push    edx
-        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
-        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
-        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
-        pop     edx
-.skip:
-        ; -- process the first column block
-
-        movdqa  xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
-        movdqa  xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
-        movdqa  xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-
-        pcmpeqb xmm7,xmm7
-        psrldq  xmm7,(SIZEOF_XMMWORD-2)
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
-
-        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
-        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
-
-        movdqa  XMMWORD [wk(0)], xmm1
-        movdqa  XMMWORD [wk(1)], xmm2
-
-        poppic  ebx
-
-        add     eax, byte SIZEOF_XMMWORD-1
-        and     eax, byte -SIZEOF_XMMWORD
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      short .columnloop
-        alignx  16,7
-
-.columnloop_last:
-        ; -- process the last column block
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pcmpeqb xmm1,xmm1
-        pslldq  xmm1,(SIZEOF_XMMWORD-2)
-        movdqa  xmm2,xmm1
-
-        pand    xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
-        pand    xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
-        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
-        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
-
-        jmp     near .upsample
-        alignx  16,7
-
-.columnloop:
-        ; -- process the next column block
-
-        movdqa  xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
-        movdqa  xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
-        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
-
-        pushpic ebx
-        movpic  ebx, POINTER [gotptr]   ; load GOT address
-
-        pxor      xmm3,xmm3             ; xmm3=(all 0's)
-        movdqa    xmm4,xmm0
-        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm5,xmm1
-        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
-        movdqa    xmm6,xmm2
-        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
-        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
-
-        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-
-        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
-        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
-        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
-
-        movdqa  XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
-        movdqa  XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
-        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
-
-        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
-        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
-
-        movdqa  XMMWORD [wk(2)], xmm1
-        movdqa  XMMWORD [wk(3)], xmm2
-
-.upsample:
-        ; -- process the upper row
-
-        movdqa  xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
-        movdqa  xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
-        movdqa  xmm5,xmm7
-        movdqa  xmm6,xmm3
-        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
-
-        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
-        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm2,xmm3
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm4,xmm3
-        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(0)], xmm4
-
-        pmullw  xmm7,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm3,[GOTOFF(ebx,PW_THREE)]
-        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm5,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm0,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   xmm2,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   xmm1,xmm7
-        paddw   xmm5,xmm3
-        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm0,xmm7
-        paddw   xmm2,xmm3
-        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm0,BYTE_BIT
-        psllw   xmm2,BYTE_BIT
-        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
-        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
-
-        ; -- process the lower row
-
-        movdqa  xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
-        movdqa  xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
-
-        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
-        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
-        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
-        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
-        movdqa  xmm0,xmm6
-        movdqa  xmm2,xmm4
-        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
-        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
-
-        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
-        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
-
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm4
-        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
-        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
-        movdqa  xmm3,xmm4
-        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
-
-        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
-        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
-
-        movdqa  XMMWORD [wk(1)], xmm3
-
-        pmullw  xmm6,[GOTOFF(ebx,PW_THREE)]
-        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
-        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm0,[GOTOFF(ebx,PW_EIGHT)]
-        paddw   xmm7,[GOTOFF(ebx,PW_SEVEN)]
-        paddw   xmm5,[GOTOFF(ebx,PW_SEVEN)]
-
-        paddw   xmm1,xmm6
-        paddw   xmm0,xmm4
-        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
-        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
-        paddw   xmm7,xmm6
-        paddw   xmm5,xmm4
-        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
-        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
-
-        psllw   xmm7,BYTE_BIT
-        psllw   xmm5,BYTE_BIT
-        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
-        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
-
-        poppic  ebx
-
-        sub     eax, byte SIZEOF_XMMWORD
-        add     ecx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
-        add     ebx, byte 1*SIZEOF_XMMWORD      ; inptr0
-        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
-        add     edx, byte 2*SIZEOF_XMMWORD      ; outptr0
-        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr1
-        cmp     eax, byte SIZEOF_XMMWORD
-        ja      near .columnloop
-        test    eax,eax
-        jnz     near .columnloop_last
-
-        pop     esi
-        pop     edi
-        pop     ecx
-        pop     eax
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      near .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v1_upsample_sse2)
-
-EXTN(jsimd_h2v1_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_XMMWORD)-1
-        and     edx, byte -(2*SIZEOF_XMMWORD)
-        jz      short .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      short .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]             ; inptr
-        mov     edi, JSAMPROW [edi]             ; outptr
-        mov     eax,edx                         ; colctr
-        alignx  16,7
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte SIZEOF_JSAMPROW       ; input_data
-        add     edi, byte SIZEOF_JSAMPROW       ; output_data
-        dec     ecx                             ; rowctr
-        jg      short .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
-; It's still a box filter.
-;
-; GLOBAL(void)
-; jsimd_h2v2_upsample_sse2 (nt max_v_samp_factor,
-;                           JDIMENSION output_width,
-;                           JSAMPARRAY input_data,
-;                           JSAMPARRAY *output_data_ptr);
-;
-
-%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
-%define output_width(b)         (b)+12          ; JDIMENSION output_width
-%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
-%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
-
-        align   16
-        global  EXTN(jsimd_h2v2_upsample_sse2)
-
-EXTN(jsimd_h2v2_upsample_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     edx, JDIMENSION [output_width(ebp)]
-        add     edx, byte (2*SIZEOF_XMMWORD)-1
-        and     edx, byte -(2*SIZEOF_XMMWORD)
-        jz      near .return
-
-        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
-        test    ecx,ecx
-        jz      near .return
-
-        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
-        mov     edi, POINTER [output_data_ptr(ebp)]
-        mov     edi, JSAMPARRAY [edi]                   ; output_data
-        alignx  16,7
-.rowloop:
-        push    edi
-        push    esi
-
-        mov     esi, JSAMPROW [esi]                     ; inptr
-        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
-        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
-        mov     eax,edx                                 ; colctr
-        alignx  16,7
-.columnloop:
-
-        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
-
-        movdqa    xmm1,xmm0
-        punpcklbw xmm0,xmm0
-        punpckhbw xmm1,xmm1
-
-        movdqa  XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
-        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
-        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
-
-        movdqa    xmm3,xmm2
-        punpcklbw xmm2,xmm2
-        punpckhbw xmm3,xmm3
-
-        movdqa  XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
-        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
-        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
-
-        sub     eax, byte 2*SIZEOF_XMMWORD
-        jz      short .nextrow
-
-        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
-        add     ebx, byte 4*SIZEOF_XMMWORD      ; outptr0
-        add     edi, byte 4*SIZEOF_XMMWORD      ; outptr1
-        jmp     short .columnloop
-        alignx  16,7
-
-.nextrow:
-        pop     esi
-        pop     edi
-
-        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
-        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
-        sub     ecx, byte 2                     ; rowctr
-        jg      short .rowloop
-
-.return:
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctflt-3dn.asm b/media/libjpeg/simd/jfdctflt-3dn.asm
deleted file mode 100644
index 219161819a..0000000000
--- a/media/libjpeg/simd/jfdctflt-3dn.asm
+++ /dev/null
@@ -1,319 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (3DNow!)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_float_3dnow)
-
-EXTN(jconst_fdct_float_3dnow):
-
-PD_0_382        times 2 dd  0.382683432365089771728460
-PD_0_707        times 2 dd  0.707106781186547524400844
-PD_0_541        times 2 dd  0.541196100146196984399723
-PD_1_306        times 2 dd  1.306562964876376527856643
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_3dnow (FAST_FLOAT *data)
-;
-
-%define data(b)         (b)+8           ; FAST_FLOAT *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_float_3dnow)
-
-EXTN(jsimd_fdct_float_3dnow):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.rowloop:
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
-
-        movq      mm4,mm0               ; transpose coefficients
-        punpckldq mm0,mm1               ; mm0=(00 10)=data0
-        punpckhdq mm4,mm1               ; mm4=(01 11)=data1
-        movq      mm5,mm2               ; transpose coefficients
-        punpckldq mm2,mm3               ; mm2=(06 16)=data6
-        punpckhdq mm5,mm3               ; mm5=(07 17)=data7
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
-        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
-        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
-        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0
-
-        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7
-
-        movq      mm4,mm1               ; transpose coefficients
-        punpckldq mm1,mm3               ; mm1=(02 12)=data2
-        punpckhdq mm4,mm3               ; mm4=(03 13)=data3
-        movq      mm0,mm2               ; transpose coefficients
-        punpckldq mm2,mm5               ; mm2=(04 14)=data4
-        punpckhdq mm0,mm5               ; mm0=(05 15)=data5
-
-        movq    mm3,mm4
-        movq    mm5,mm1
-        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
-        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
-        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
-        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm2,mm7
-        movq    mm0,mm6
-        pfsub   mm7,mm4                 ; mm7=tmp13
-        pfsub   mm6,mm1                 ; mm6=tmp12
-        pfadd   mm2,mm4                 ; mm2=tmp10
-        pfadd   mm0,mm1                 ; mm0=tmp11
-
-        pfadd   mm6,mm7
-        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-        movq    mm4,mm2
-        movq    mm1,mm7
-        pfsub   mm2,mm0                 ; mm2=data4
-        pfsub   mm7,mm6                 ; mm7=data6
-        pfadd   mm4,mm0                 ; mm4=data0
-        pfadd   mm1,mm6                 ; mm1=data2
-
-        movq    MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
-        movq    MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7
-
-        pfadd   mm3,mm5                 ; mm3=tmp10
-        pfadd   mm5,mm0                 ; mm5=tmp11
-        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7
-
-        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-        movq    mm2,mm3                 ; mm2=tmp10
-        pfsub   mm3,mm0
-        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-        pfadd   mm2,mm3                 ; mm2=z2
-        pfadd   mm0,mm3                 ; mm0=z4
-
-        movq    mm7,mm6
-        pfsub   mm6,mm5                 ; mm6=z13
-        pfadd   mm7,mm5                 ; mm7=z11
-
-        movq    mm4,mm6
-        movq    mm1,mm7
-        pfsub   mm6,mm2                 ; mm6=data3
-        pfsub   mm7,mm0                 ; mm7=data7
-        pfadd   mm4,mm2                 ; mm4=data5
-        pfadd   mm1,mm0                 ; mm1=data1
-
-        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
-
-        movq      mm4,mm0               ; transpose coefficients
-        punpckldq mm0,mm1               ; mm0=(00 01)=data0
-        punpckhdq mm4,mm1               ; mm4=(10 11)=data1
-        movq      mm5,mm2               ; transpose coefficients
-        punpckldq mm2,mm3               ; mm2=(60 61)=data6
-        punpckhdq mm5,mm3               ; mm5=(70 71)=data7
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm2                 ; mm4=data1-data6=tmp6
-        pfsub   mm0,mm5                 ; mm0=data0-data7=tmp7
-        pfadd   mm6,mm2                 ; mm6=data1+data6=tmp1
-        pfadd   mm7,mm5                 ; mm7=data0+data7=tmp0
-
-        movq    mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm0     ; wk(1)=tmp7
-
-        movq      mm4,mm1               ; transpose coefficients
-        punpckldq mm1,mm3               ; mm1=(20 21)=data2
-        punpckhdq mm4,mm3               ; mm4=(30 31)=data3
-        movq      mm0,mm2               ; transpose coefficients
-        punpckldq mm2,mm5               ; mm2=(40 41)=data4
-        punpckhdq mm0,mm5               ; mm0=(50 51)=data5
-
-        movq    mm3,mm4
-        movq    mm5,mm1
-        pfadd   mm4,mm2                 ; mm4=data3+data4=tmp3
-        pfadd   mm1,mm0                 ; mm1=data2+data5=tmp2
-        pfsub   mm3,mm2                 ; mm3=data3-data4=tmp4
-        pfsub   mm5,mm0                 ; mm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm2,mm7
-        movq    mm0,mm6
-        pfsub   mm7,mm4                 ; mm7=tmp13
-        pfsub   mm6,mm1                 ; mm6=tmp12
-        pfadd   mm2,mm4                 ; mm2=tmp10
-        pfadd   mm0,mm1                 ; mm0=tmp11
-
-        pfadd   mm6,mm7
-        pfmul   mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
-
-        movq    mm4,mm2
-        movq    mm1,mm7
-        pfsub   mm2,mm0                 ; mm2=data4
-        pfsub   mm7,mm6                 ; mm7=data6
-        pfadd   mm4,mm0                 ; mm4=data0
-        pfadd   mm1,mm6                 ; mm1=data2
-
-        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
-        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [wk(0)]     ; mm0=tmp6
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp7
-
-        pfadd   mm3,mm5                 ; mm3=tmp10
-        pfadd   mm5,mm0                 ; mm5=tmp11
-        pfadd   mm0,mm6                 ; mm0=tmp12, mm6=tmp7
-
-        pfmul   mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
-
-        movq    mm2,mm3                 ; mm2=tmp10
-        pfsub   mm3,mm0
-        pfmul   mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
-        pfmul   mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
-        pfmul   mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
-        pfadd   mm2,mm3                 ; mm2=z2
-        pfadd   mm0,mm3                 ; mm0=z4
-
-        movq    mm7,mm6
-        pfsub   mm6,mm5                 ; mm6=z13
-        pfadd   mm7,mm5                 ; mm7=z11
-
-        movq    mm4,mm6
-        movq    mm1,mm7
-        pfsub   mm6,mm2                 ; mm6=data3
-        pfsub   mm7,mm0                 ; mm7=data7
-        pfadd   mm4,mm2                 ; mm4=data5
-        pfadd   mm1,mm0                 ; mm1=data1
-
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
-
-        add     edx, byte 2*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .columnloop
-
-        femms           ; empty MMX/3DNow! state
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctflt-sse-64.asm b/media/libjpeg/simd/jfdctflt-sse-64.asm
deleted file mode 100644
index 4b64ea4bb5..0000000000
--- a/media/libjpeg/simd/jfdctflt-sse-64.asm
+++ /dev/null
@@ -1,357 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (64-bit SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382        times 4 dd  0.382683432365089771728460
-PD_0_707        times 4 dd  0.707106781186547524400844
-PD_0_541        times 4 dd  0.541196100146196984399723
-PD_1_306        times 4 dd  1.306562964876376527856643
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-; r10 = FAST_FLOAT *data
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process rows.
-
-        mov     rdx, r10        ; (FAST_FLOAT *)
-        mov     rcx, DCTSIZE/4
-.rowloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
-        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
-        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
-        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
-        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[rel PD_0_707] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[rel PD_0_707] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[rel PD_0_382] ; xmm2=z5
-        mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     rcx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     rdx, r10        ; (FAST_FLOAT *)
-        mov     rcx, DCTSIZE/4
-.columnloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
-        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
-        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
-        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
-        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[rel PD_0_707] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[rel PD_0_707] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[rel PD_0_382] ; xmm2=z5
-        mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     rdx, byte 4*SIZEOF_FAST_FLOAT
-        dec     rcx
-        jnz     near .columnloop
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctflt-sse.asm b/media/libjpeg/simd/jfdctflt-sse.asm
deleted file mode 100644
index e7ede26c0c..0000000000
--- a/media/libjpeg/simd/jfdctflt-sse.asm
+++ /dev/null
@@ -1,369 +0,0 @@
-;
-; jfdctflt.asm - floating-point FDCT (SSE)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the forward DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_float_sse)
-
-EXTN(jconst_fdct_float_sse):
-
-PD_0_382        times 4 dd  0.382683432365089771728460
-PD_0_707        times 4 dd  0.707106781186547524400844
-PD_0_541        times 4 dd  0.541196100146196984399723
-PD_1_306        times 4 dd  1.306562964876376527856643
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_float_sse (FAST_FLOAT *data)
-;
-
-%define data(b)         (b)+8           ; FAST_FLOAT *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_float_sse)
-
-EXTN(jsimd_fdct_float_sse):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.rowloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
-        ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
-        unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
-        unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
-        ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
-        unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
-        unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.columnloop:
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
-        ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
-
-        movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
-        unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
-        unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
-        movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
-        unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
-        unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
-
-        movaps  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
-
-        ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
-        ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
-
-        movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
-        movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
-
-        movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
-        unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
-        unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
-        movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
-        unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
-        unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
-
-        movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
-        unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
-        movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
-        unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
-        unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
-
-        movaps  xmm0,xmm7
-        movaps  xmm5,xmm6
-        subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
-        subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
-        addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
-
-        movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
-        movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
-        movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
-        unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
-        movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
-        unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
-
-        movaps  xmm2,xmm7
-        movaps  xmm3,xmm4
-        addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
-        addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
-        subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movaps  xmm1,xmm5
-        movaps  xmm6,xmm0
-        subps   xmm5,xmm7               ; xmm5=tmp13
-        subps   xmm0,xmm4               ; xmm0=tmp12
-        addps   xmm1,xmm7               ; xmm1=tmp10
-        addps   xmm6,xmm4               ; xmm6=tmp11
-
-        addps   xmm0,xmm5
-        mulps   xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
-
-        movaps  xmm7,xmm1
-        movaps  xmm4,xmm5
-        subps   xmm1,xmm6               ; xmm1=data4
-        subps   xmm5,xmm0               ; xmm5=data6
-        addps   xmm7,xmm6               ; xmm7=data0
-        addps   xmm4,xmm0               ; xmm4=data2
-
-        movaps  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        ; -- Odd part
-
-        movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
-        movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
-
-        addps   xmm2,xmm3               ; xmm2=tmp10
-        addps   xmm3,xmm6               ; xmm3=tmp11
-        addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
-
-        mulps   xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
-
-        movaps  xmm1,xmm2               ; xmm1=tmp10
-        subps   xmm2,xmm6
-        mulps   xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
-        mulps   xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
-        mulps   xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
-        addps   xmm1,xmm2               ; xmm1=z2
-        addps   xmm6,xmm2               ; xmm6=z4
-
-        movaps  xmm5,xmm0
-        subps   xmm0,xmm3               ; xmm0=z13
-        addps   xmm5,xmm3               ; xmm5=z11
-
-        movaps  xmm7,xmm0
-        movaps  xmm4,xmm5
-        subps   xmm0,xmm1               ; xmm0=data3
-        subps   xmm5,xmm6               ; xmm5=data7
-        addps   xmm7,xmm1               ; xmm7=data5
-        addps   xmm4,xmm6               ; xmm4=data1
-
-        movaps  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
-        movaps  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
-
-        add     edx, byte 4*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .columnloop
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctfst-mmx.asm b/media/libjpeg/simd/jfdctfst-mmx.asm
deleted file mode 100644
index eb2eb9c50d..0000000000
--- a/media/libjpeg/simd/jfdctfst-mmx.asm
+++ /dev/null
@@ -1,396 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ      98             ; FIX(0.382683433)
-F_0_541 equ     139             ; FIX(0.541196100)
-F_0_707 equ     181             ; FIX(0.707106781)
-F_1_306 equ     334             ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
-F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_fdct_ifast_mmx)
-
-EXTN(jconst_fdct_ifast_mmx):
-
-PW_F0707        times 4 dw  F_0_707 << CONST_SHIFT
-PW_F0382        times 4 dw  F_0_382 << CONST_SHIFT
-PW_F0541        times 4 dw  F_0_541 << CONST_SHIFT
-PW_F1306        times 4 dw  F_1_306 << CONST_SHIFT
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_mmx (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_ifast_mmx)
-
-EXTN(jsimd_fdct_ifast_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.rowloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
-        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
-        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
-        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
-        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
-        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
-        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
-        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
-        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
-        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
-        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        psubw   mm5,mm7                 ; mm5=tmp13
-        psubw   mm0,mm4                 ; mm0=tmp12
-        paddw   mm1,mm7                 ; mm1=tmp10
-        paddw   mm6,mm4                 ; mm6=tmp11
-
-        paddw   mm0,mm5
-        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-        movq    mm7,mm1
-        movq    mm4,mm5
-        psubw   mm1,mm6                 ; mm1=data4
-        psubw   mm5,mm0                 ; mm5=data6
-        paddw   mm7,mm6                 ; mm7=data0
-        paddw   mm4,mm0                 ; mm4=data2
-
-        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-        ; -- Odd part
-
-        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
-        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7
-
-        paddw   mm2,mm3                 ; mm2=tmp10
-        paddw   mm3,mm6                 ; mm3=tmp11
-        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7
-
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm6,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-        movq    mm1,mm2                 ; mm1=tmp10
-        psubw   mm2,mm6
-        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-        paddw   mm1,mm2                 ; mm1=z2
-        paddw   mm6,mm2                 ; mm6=z4
-
-        movq    mm5,mm0
-        psubw   mm0,mm3                 ; mm0=z13
-        paddw   mm5,mm3                 ; mm5=z11
-
-        movq    mm7,mm0
-        movq    mm4,mm5
-        psubw   mm0,mm1                 ; mm0=data3
-        psubw   mm5,mm6                 ; mm5=data7
-        paddw   mm7,mm1                 ; mm7=data5
-        paddw   mm4,mm6                 ; mm4=data1
-
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
-        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
-        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
-        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
-        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
-        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
-        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
-        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
-        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
-        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
-        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        psubw   mm5,mm7                 ; mm5=tmp13
-        psubw   mm0,mm4                 ; mm0=tmp12
-        paddw   mm1,mm7                 ; mm1=tmp10
-        paddw   mm6,mm4                 ; mm6=tmp11
-
-        paddw   mm0,mm5
-        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1
-
-        movq    mm7,mm1
-        movq    mm4,mm5
-        psubw   mm1,mm6                 ; mm1=data4
-        psubw   mm5,mm0                 ; mm5=data6
-        paddw   mm7,mm6                 ; mm7=data0
-        paddw   mm4,mm0                 ; mm4=data2
-
-        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-
-        ; -- Odd part
-
-        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
-        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7
-
-        paddw   mm2,mm3                 ; mm2=tmp10
-        paddw   mm3,mm6                 ; mm3=tmp11
-        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7
-
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm6,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3
-
-        movq    mm1,mm2                 ; mm1=tmp10
-        psubw   mm2,mm6
-        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
-        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
-        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
-        paddw   mm1,mm2                 ; mm1=z2
-        paddw   mm6,mm2                 ; mm6=z4
-
-        movq    mm5,mm0
-        psubw   mm0,mm3                 ; mm0=z13
-        paddw   mm5,mm3                 ; mm5=z11
-
-        movq    mm7,mm0
-        movq    mm4,mm5
-        psubw   mm0,mm1                 ; mm0=data3
-        psubw   mm5,mm6                 ; mm5=data7
-        paddw   mm7,mm1                 ; mm7=data5
-        paddw   mm4,mm6                 ; mm4=data1
-
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
-
-        add     edx, byte 4*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .columnloop
-
-        emms            ; empty MMX state
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctfst-sse2-64.asm b/media/libjpeg/simd/jfdctfst-sse2-64.asm
deleted file mode 100644
index 4c96685427..0000000000
--- a/media/libjpeg/simd/jfdctfst-sse2-64.asm
+++ /dev/null
@@ -1,391 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ      98             ; FIX(0.382683433)
-F_0_541 equ     139             ; FIX(0.541196100)
-F_0_707 equ     181             ; FIX(0.707106781)
-F_1_306 equ     334             ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
-F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process rows.
-
-        mov     rdx, r10        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        psubw   xmm3,xmm1               ; xmm3=tmp13
-        psubw   xmm6,xmm7               ; xmm6=tmp12
-        paddw   xmm4,xmm1               ; xmm4=tmp10
-        paddw   xmm0,xmm7               ; xmm0=tmp11
-
-        paddw   xmm6,xmm3
-        psllw   xmm6,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm6,[rel PW_F0707] ; xmm6=z1
-
-        movdqa  xmm1,xmm4
-        movdqa  xmm7,xmm3
-        psubw   xmm4,xmm0               ; xmm4=data4
-        psubw   xmm3,xmm6               ; xmm3=data6
-        paddw   xmm1,xmm0               ; xmm1=data0
-        paddw   xmm7,xmm6               ; xmm7=data2
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
-
-        ; -- Odd part
-
-        paddw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm5,xmm0               ; xmm5=tmp11
-        paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z3
-
-        movdqa  xmm4,xmm2               ; xmm4=tmp10
-        psubw   xmm2,xmm0
-        pmulhw  xmm2,[rel PW_F0382] ; xmm2=z5
-        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm2               ; xmm4=z2
-        paddw   xmm0,xmm2               ; xmm0=z4
-
-        movdqa  xmm3,xmm6
-        psubw   xmm6,xmm5               ; xmm6=z13
-        paddw   xmm3,xmm5               ; xmm3=z11
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm5,xmm3
-        psubw   xmm6,xmm4               ; xmm6=data3
-        psubw   xmm3,xmm0               ; xmm3=data7
-        paddw   xmm2,xmm4               ; xmm2=data5
-        paddw   xmm5,xmm0               ; xmm5=data1
-
-        ; ---- Pass 2: process columns.
-
-        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
-        movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
-
-        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
-        movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
-        punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
-        movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
-        movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm3,xmm1
-        psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
-        psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
-        paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
-        paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
-
-        movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm7,xmm6
-        movdqa  xmm0,xmm2
-        paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
-        paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
-        psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
-        psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm1,xmm5
-        psubw   xmm3,xmm6               ; xmm3=tmp13
-        psubw   xmm5,xmm2               ; xmm5=tmp12
-        paddw   xmm4,xmm6               ; xmm4=tmp10
-        paddw   xmm1,xmm2               ; xmm1=tmp11
-
-        paddw   xmm5,xmm3
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[rel PW_F0707] ; xmm5=z1
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm2,xmm3
-        psubw   xmm4,xmm1               ; xmm4=data4
-        psubw   xmm3,xmm5               ; xmm3=data6
-        paddw   xmm6,xmm1               ; xmm6=data0
-        paddw   xmm2,xmm5               ; xmm2=data2
-
-        movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-        ; -- Odd part
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        paddw   xmm7,xmm0               ; xmm7=tmp10
-        paddw   xmm0,xmm1               ; xmm0=tmp11
-        paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
-
-        psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm0,[rel PW_F0707] ; xmm0=z3
-
-        movdqa  xmm4,xmm7               ; xmm4=tmp10
-        psubw   xmm7,xmm1
-        pmulhw  xmm7,[rel PW_F0382] ; xmm7=z5
-        pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm7               ; xmm4=z2
-        paddw   xmm1,xmm7               ; xmm1=z4
-
-        movdqa  xmm3,xmm5
-        psubw   xmm5,xmm0               ; xmm5=z13
-        paddw   xmm3,xmm0               ; xmm3=z11
-
-        movdqa  xmm6,xmm5
-        movdqa  xmm2,xmm3
-        psubw   xmm5,xmm4               ; xmm5=data3
-        psubw   xmm3,xmm1               ; xmm3=data7
-        paddw   xmm6,xmm4               ; xmm6=data5
-        paddw   xmm2,xmm1               ; xmm2=data1
-
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
-        movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctfst-sse2.asm b/media/libjpeg/simd/jfdctfst-sse2.asm
deleted file mode 100644
index 54856a2363..0000000000
--- a/media/libjpeg/simd/jfdctfst-sse2.asm
+++ /dev/null
@@ -1,403 +0,0 @@
-;
-; jfdctfst.asm - fast integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the forward DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-
-%if CONST_BITS == 8
-F_0_382 equ      98             ; FIX(0.382683433)
-F_0_541 equ     139             ; FIX(0.541196100)
-F_0_707 equ     181             ; FIX(0.707106781)
-F_1_306 equ     334             ; FIX(1.306562965)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
-F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_fdct_ifast_sse2)
-
-EXTN(jconst_fdct_ifast_sse2):
-
-PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
-PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
-PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
-PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_ifast_sse2 (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_ifast_sse2)
-
-EXTN(jsimd_fdct_ifast_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        psubw   xmm3,xmm1               ; xmm3=tmp13
-        psubw   xmm6,xmm7               ; xmm6=tmp12
-        paddw   xmm4,xmm1               ; xmm4=tmp10
-        paddw   xmm0,xmm7               ; xmm0=tmp11
-
-        paddw   xmm6,xmm3
-        psllw   xmm6,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
-
-        movdqa  xmm1,xmm4
-        movdqa  xmm7,xmm3
-        psubw   xmm4,xmm0               ; xmm4=data4
-        psubw   xmm3,xmm6               ; xmm3=data6
-        paddw   xmm1,xmm0               ; xmm1=data0
-        paddw   xmm7,xmm6               ; xmm7=data2
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
-
-        ; -- Odd part
-
-        paddw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm5,xmm0               ; xmm5=tmp11
-        paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
-
-        movdqa  xmm4,xmm2               ; xmm4=tmp10
-        psubw   xmm2,xmm0
-        pmulhw  xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm2               ; xmm4=z2
-        paddw   xmm0,xmm2               ; xmm0=z4
-
-        movdqa  xmm3,xmm6
-        psubw   xmm6,xmm5               ; xmm6=z13
-        paddw   xmm3,xmm5               ; xmm3=z11
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm5,xmm3
-        psubw   xmm6,xmm4               ; xmm6=data3
-        psubw   xmm3,xmm0               ; xmm3=data7
-        paddw   xmm2,xmm4               ; xmm2=data5
-        paddw   xmm5,xmm0               ; xmm5=data1
-
-        ; ---- Pass 2: process columns.
-
-;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
-        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
-        movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
-
-        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
-        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
-        movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
-        punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
-        movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
-        movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm3,xmm1
-        psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
-        psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
-        paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
-        paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
-        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
-
-        movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm7,xmm6
-        movdqa  xmm0,xmm2
-        paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
-        paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
-        psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
-        psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm1,xmm5
-        psubw   xmm3,xmm6               ; xmm3=tmp13
-        psubw   xmm5,xmm2               ; xmm5=tmp12
-        paddw   xmm4,xmm6               ; xmm4=tmp10
-        paddw   xmm1,xmm2               ; xmm1=tmp11
-
-        paddw   xmm5,xmm3
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm2,xmm3
-        psubw   xmm4,xmm1               ; xmm4=data4
-        psubw   xmm3,xmm5               ; xmm3=data6
-        paddw   xmm6,xmm1               ; xmm6=data0
-        paddw   xmm2,xmm5               ; xmm2=data2
-
-        movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
-
-        ; -- Odd part
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        paddw   xmm7,xmm0               ; xmm7=tmp10
-        paddw   xmm0,xmm1               ; xmm0=tmp11
-        paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
-
-        psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
-
-        movdqa  xmm4,xmm7               ; xmm4=tmp10
-        psubw   xmm7,xmm1
-        pmulhw  xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
-        pmulhw  xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
-        paddw   xmm4,xmm7               ; xmm4=z2
-        paddw   xmm1,xmm7               ; xmm1=z4
-
-        movdqa  xmm3,xmm5
-        psubw   xmm5,xmm0               ; xmm5=z13
-        paddw   xmm3,xmm0               ; xmm3=z11
-
-        movdqa  xmm6,xmm5
-        movdqa  xmm2,xmm3
-        psubw   xmm5,xmm4               ; xmm5=data3
-        psubw   xmm3,xmm1               ; xmm3=data7
-        paddw   xmm6,xmm4               ; xmm6=data5
-        paddw   xmm2,xmm1               ; xmm2=data1
-
-        movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
-        movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
-        movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
-        movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctint-altivec.c b/media/libjpeg/simd/jfdctint-altivec.c
deleted file mode 100644
index e6e8a5687e..0000000000
--- a/media/libjpeg/simd/jfdctint-altivec.c
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * AltiVec optimizations for libjpeg-turbo
- *
- * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* SLOW INTEGER FORWARD DCT */
-
-#include "jsimd_altivec.h"
-
-
-#define F_0_298 2446   /* FIX(0.298631336) */
-#define F_0_390 3196   /* FIX(0.390180644) */
-#define F_0_541 4433   /* FIX(0.541196100) */
-#define F_0_765 6270   /* FIX(0.765366865) */
-#define F_0_899 7373   /* FIX(0.899976223) */
-#define F_1_175 9633   /* FIX(1.175875602) */
-#define F_1_501 12299  /* FIX(1.501321110) */
-#define F_1_847 15137  /* FIX(1.847759065) */
-#define F_1_961 16069  /* FIX(1.961570560) */
-#define F_2_053 16819  /* FIX(2.053119869) */
-#define F_2_562 20995  /* FIX(2.562915447) */
-#define F_3_072 25172  /* FIX(3.072711026) */
-
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS)
-
-
-#define DO_FDCT_COMMON(PASS)  \
-{  \
-  /* (Original)  \
-   * z1 = (tmp12 + tmp13) * 0.541196100;  \
-   * data2 = z1 + tmp13 * 0.765366865;  \
-   * data6 = z1 + tmp12 * -1.847759065;  \
-   *  \
-   * (This implementation)  \
-   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
-   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
-   */  \
-  \
-  tmp1312l = vec_mergeh(tmp13, tmp12);  \
-  tmp1312h = vec_mergel(tmp13, tmp12);  \
-  \
-  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS);  \
-  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS);  \
-  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS);  \
-  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS);  \
-  \
-  out2l = vec_sra(out2l, descale_p##PASS);  \
-  out2h = vec_sra(out2h, descale_p##PASS);  \
-  out6l = vec_sra(out6l, descale_p##PASS);  \
-  out6h = vec_sra(out6h, descale_p##PASS);  \
-  \
-  out2 = vec_pack(out2l, out2h);  \
-  out6 = vec_pack(out6l, out6h);  \
-  \
-  /* Odd part */  \
-  \
-  z3 = vec_add(tmp4, tmp6);  \
-  z4 = vec_add(tmp5, tmp7);  \
-  \
-  /* (Original)  \
-   * z5 = (z3 + z4) * 1.175875602;  \
-   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
-   * z3 += z5;  z4 += z5;  \
-   *  \
-   * (This implementation)  \
-   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
-   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
-   */  \
-  \
-  z34l = vec_mergeh(z3, z4);  \
-  z34h = vec_mergel(z3, z4);  \
-  \
-  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
-  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
-  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
-  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
-  \
-  /* (Original)  \
-   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
-   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
-   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
-   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
-   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
-   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
-   *  \
-   * (This implementation)  \
-   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;  \
-   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;  \
-   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);  \
-   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);  \
-   * data7 = tmp4 + z3;  data5 = tmp5 + z4;  \
-   * data3 = tmp6 + z3;  data1 = tmp7 + z4;  \
-   */  \
-  \
-  tmp47l = vec_mergeh(tmp4, tmp7);  \
-  tmp47h = vec_mergel(tmp4, tmp7);  \
-  \
-  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l);  \
-  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h);  \
-  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l);  \
-  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h);  \
-  \
-  out7l = vec_sra(out7l, descale_p##PASS);  \
-  out7h = vec_sra(out7h, descale_p##PASS);  \
-  out1l = vec_sra(out1l, descale_p##PASS);  \
-  out1h = vec_sra(out1h, descale_p##PASS);  \
-  \
-  out7 = vec_pack(out7l, out7h);  \
-  out1 = vec_pack(out1l, out1h);  \
-  \
-  tmp56l = vec_mergeh(tmp5, tmp6);  \
-  tmp56h = vec_mergel(tmp5, tmp6);  \
-  \
-  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l);  \
-  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h);  \
-  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l);  \
-  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h);  \
-  \
-  out5l = vec_sra(out5l, descale_p##PASS);  \
-  out5h = vec_sra(out5h, descale_p##PASS);  \
-  out3l = vec_sra(out3l, descale_p##PASS);  \
-  out3h = vec_sra(out3h, descale_p##PASS);  \
-  \
-  out5 = vec_pack(out5l, out5h);  \
-  out3 = vec_pack(out3l, out3h);  \
-}
-
-#define DO_FDCT_PASS1()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out0  = vec_sl(out0, pass1_bits);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  out4  = vec_sl(out4, pass1_bits);  \
-  \
-  DO_FDCT_COMMON(1);  \
-}
-
-#define DO_FDCT_PASS2()  \
-{  \
-  /* Even part */  \
-  \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
-  \
-  out0  = vec_add(tmp10, tmp11);  \
-  out0  = vec_add(out0, pw_descale_p2x);  \
-  out0  = vec_sra(out0, pass1_bits);  \
-  out4  = vec_sub(tmp10, tmp11);  \
-  out4  = vec_add(out4, pw_descale_p2x);  \
-  out4  = vec_sra(out4, pass1_bits);  \
-  \
-  DO_FDCT_COMMON(2);  \
-}
-
-
-void
-jsimd_fdct_islow_altivec (DCTELEM *data)
-{
-  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
-    col0, col1, col2, col3, col4, col5, col6, col7,
-    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
-    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
-    z3, z4, z34l, z34h,
-    out0, out1, out2, out3, out4, out5, out6, out7;
-  __vector int z3l, z3h, z4l, z4h,
-    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
-    out7l, out7h;
-
-  /* Constants */
-  __vector short
-    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
-    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
-    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
-    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
-    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
-    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
-    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
-    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
-    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
-  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
-  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
-    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
-  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
-    descale_p2 = { __4X(DESCALE_P2) };
-
-  /* Pass 1: process rows */
-
-  row0 = vec_ld(0, data);
-  row1 = vec_ld(16, data);
-  row2 = vec_ld(32, data);
-  row3 = vec_ld(48, data);
-  row4 = vec_ld(64, data);
-  row5 = vec_ld(80, data);
-  row6 = vec_ld(96, data);
-  row7 = vec_ld(112, data);
-
-  TRANSPOSE(row, col);
-
-  tmp0 = vec_add(col0, col7);
-  tmp7 = vec_sub(col0, col7);
-  tmp1 = vec_add(col1, col6);
-  tmp6 = vec_sub(col1, col6);
-  tmp2 = vec_add(col2, col5);
-  tmp5 = vec_sub(col2, col5);
-  tmp3 = vec_add(col3, col4);
-  tmp4 = vec_sub(col3, col4);
-
-  DO_FDCT_PASS1();
-
-  /* Pass 2: process columns */
-
-  TRANSPOSE(out, row);
-
-  tmp0 = vec_add(row0, row7);
-  tmp7 = vec_sub(row0, row7);
-  tmp1 = vec_add(row1, row6);
-  tmp6 = vec_sub(row1, row6);
-  tmp2 = vec_add(row2, row5);
-  tmp5 = vec_sub(row2, row5);
-  tmp3 = vec_add(row3, row4);
-  tmp4 = vec_sub(row3, row4);
-
-  DO_FDCT_PASS2();
-
-  vec_st(out0, 0, data);
-  vec_st(out1, 16, data);
-  vec_st(out2, 32, data);
-  vec_st(out3, 48, data);
-  vec_st(out4, 64, data);
-  vec_st(out5, 80, data);
-  vec_st(out6, 96, data);
-  vec_st(out7, 112, data);
-}
diff --git a/media/libjpeg/simd/jfdctint-mmx.asm b/media/libjpeg/simd/jfdctint-mmx.asm
deleted file mode 100644
index 9142ad8816..0000000000
--- a/media/libjpeg/simd/jfdctint-mmx.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_islow_mmx)
-
-EXTN(jconst_fdct_islow_mmx):
-
-PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_mmx (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_fdct_islow_mmx)
-
-EXTN(jsimd_fdct_islow_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.rowloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
-        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
-        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
-        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
-        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
-        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
-        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
-        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
-        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
-        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
-        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        paddw   mm5,mm7                 ; mm5=tmp10
-        paddw   mm0,mm4                 ; mm0=tmp11
-        psubw   mm1,mm7                 ; mm1=tmp13
-        psubw   mm6,mm4                 ; mm6=tmp12
-
-        movq    mm7,mm5
-        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
-        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
-
-        psllw   mm5,PASS1_BITS          ; mm5=data0
-        psllw   mm7,PASS1_BITS          ; mm7=data4
-
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movq      mm4,mm1               ; mm1=tmp13
-        movq      mm0,mm1
-        punpcklwd mm4,mm6               ; mm6=tmp12
-        punpckhwd mm0,mm6
-        movq      mm1,mm4
-        movq      mm6,mm0
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm4,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm1,DESCALE_P1
-        psrad   mm6,DESCALE_P1
-
-        packssdw  mm4,mm0               ; mm4=data2
-        packssdw  mm1,mm6               ; mm1=data6
-
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
-
-        ; -- Odd part
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
-
-        movq    mm0,mm2                 ; mm2=tmp4
-        movq    mm6,mm3                 ; mm3=tmp5
-        paddw   mm0,mm5                 ; mm0=z3
-        paddw   mm6,mm7                 ; mm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm4,mm0
-        movq      mm1,mm0
-        punpcklwd mm4,mm6
-        punpckhwd mm1,mm6
-        movq      mm0,mm4
-        movq      mm6,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
-        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movq      mm4,mm2
-        movq      mm1,mm2
-        punpcklwd mm4,mm7
-        punpckhwd mm1,mm7
-        movq      mm2,mm4
-        movq      mm7,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
-
-        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
-        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
-        paddd   mm2,mm0                 ; mm2=data1L
-        paddd   mm7,mm6                 ; mm7=data1H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm4,DESCALE_P1
-        psrad   mm1,DESCALE_P1
-        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm2,DESCALE_P1
-        psrad   mm7,DESCALE_P1
-
-        packssdw  mm4,mm1               ; mm4=data7
-        packssdw  mm2,mm7               ; mm2=data1
-
-        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-        movq      mm1,mm3
-        movq      mm7,mm3
-        punpcklwd mm1,mm5
-        punpckhwd mm7,mm5
-        movq      mm3,mm1
-        movq      mm5,mm7
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
-
-        paddd   mm1,mm0                 ; mm1=data5L
-        paddd   mm7,mm6                 ; mm7=data5H
-        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
-        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
-
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm1,DESCALE_P1
-        psrad   mm7,DESCALE_P1
-        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   mm3,DESCALE_P1
-        psrad   mm5,DESCALE_P1
-
-        packssdw  mm1,mm7               ; mm1=data5
-        packssdw  mm3,mm5               ; mm3=data3
-
-        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .rowloop
-
-        ; ---- Pass 2: process columns.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.columnloop:
-
-        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
-        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
-
-        movq      mm4,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
-        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
-        movq      mm5,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
-        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
-
-        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-
-        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
-        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
-
-        movq      mm4,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
-        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
-        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
-        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
-        movq      mm3,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
-        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
-
-        movq    mm0,mm7
-        movq    mm5,mm6
-        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
-        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
-        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
-        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
-
-        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
-        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
-
-        movq      mm7,mm4               ; transpose coefficients(phase 2)
-        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
-        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
-        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
-
-        movq    mm2,mm7
-        movq    mm3,mm4
-        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
-        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
-        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
-        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movq    mm1,mm5
-        movq    mm6,mm0
-        paddw   mm5,mm7                 ; mm5=tmp10
-        paddw   mm0,mm4                 ; mm0=tmp11
-        psubw   mm1,mm7                 ; mm1=tmp13
-        psubw   mm6,mm4                 ; mm6=tmp12
-
-        movq    mm7,mm5
-        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
-        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
-
-        paddw   mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        paddw   mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        psraw   mm5,PASS1_BITS          ; mm5=data0
-        psraw   mm7,PASS1_BITS          ; mm7=data4
-
-        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
-        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movq      mm4,mm1               ; mm1=tmp13
-        movq      mm0,mm1
-        punpcklwd mm4,mm6               ; mm6=tmp12
-        punpckhwd mm0,mm6
-        movq      mm1,mm4
-        movq      mm6,mm0
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm4,DESCALE_P2
-        psrad   mm0,DESCALE_P2
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm1,DESCALE_P2
-        psrad   mm6,DESCALE_P2
-
-        packssdw  mm4,mm0               ; mm4=data2
-        packssdw  mm1,mm6               ; mm1=data6
-
-        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
-
-        ; -- Odd part
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
-
-        movq    mm0,mm2                 ; mm2=tmp4
-        movq    mm6,mm3                 ; mm3=tmp5
-        paddw   mm0,mm5                 ; mm0=z3
-        paddw   mm6,mm7                 ; mm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm4,mm0
-        movq      mm1,mm0
-        punpcklwd mm4,mm6
-        punpckhwd mm1,mm6
-        movq      mm0,mm4
-        movq      mm6,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
-
-        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
-        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movq      mm4,mm2
-        movq      mm1,mm2
-        punpcklwd mm4,mm7
-        punpckhwd mm1,mm7
-        movq      mm2,mm4
-        movq      mm7,mm1
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
-
-        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
-        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
-        paddd   mm2,mm0                 ; mm2=data1L
-        paddd   mm7,mm6                 ; mm7=data1H
-
-        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm4,DESCALE_P2
-        psrad   mm1,DESCALE_P2
-        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm2,DESCALE_P2
-        psrad   mm7,DESCALE_P2
-
-        packssdw  mm4,mm1               ; mm4=data7
-        packssdw  mm2,mm7               ; mm2=data1
-
-        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
-
-        movq      mm1,mm3
-        movq      mm7,mm3
-        punpcklwd mm1,mm5
-        punpckhwd mm7,mm5
-        movq      mm3,mm1
-        movq      mm5,mm7
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
-
-        paddd   mm1,mm0                 ; mm1=data5L
-        paddd   mm7,mm6                 ; mm7=data5H
-        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
-        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
-
-        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm1,DESCALE_P2
-        psrad   mm7,DESCALE_P2
-        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   mm3,DESCALE_P2
-        psrad   mm5,DESCALE_P2
-
-        packssdw  mm1,mm7               ; mm1=data5
-        packssdw  mm3,mm5               ; mm3=data3
-
-        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
-
-        add     edx, byte 4*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     near .columnloop
-
-        emms            ; empty MMX state
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctint-sse2-64.asm b/media/libjpeg/simd/jfdctint-sse2-64.asm
deleted file mode 100644
index 9a0ca0fd28..0000000000
--- a/media/libjpeg/simd/jfdctint-sse2-64.asm
+++ /dev/null
@@ -1,621 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X  times 8 dw  1 << (PASS1_BITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-; r10 = DCTELEM *data
-
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          6
-
-        align   16
-        global  EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process rows.
-
-        mov     rdx, r10        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        paddw   xmm3,xmm1               ; xmm3=tmp10
-        paddw   xmm6,xmm7               ; xmm6=tmp11
-        psubw   xmm4,xmm1               ; xmm4=tmp13
-        psubw   xmm0,xmm7               ; xmm0=tmp12
-
-        movdqa  xmm1,xmm3
-        paddw   xmm3,xmm6               ; xmm3=tmp10+tmp11
-        psubw   xmm1,xmm6               ; xmm1=tmp10-tmp11
-
-        psllw   xmm3,PASS1_BITS         ; xmm3=data0
-        psllw   xmm1,PASS1_BITS         ; xmm1=data4
-
-        movdqa  XMMWORD [wk(2)], xmm3   ; wk(2)=data0
-        movdqa  XMMWORD [wk(3)], xmm1   ; wk(3)=data4
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm7,xmm4             ; xmm4=tmp13
-        movdqa    xmm6,xmm4
-        punpcklwd xmm7,xmm0             ; xmm0=tmp12
-        punpckhwd xmm6,xmm0
-        movdqa    xmm4,xmm7
-        movdqa    xmm0,xmm6
-        pmaddwd   xmm7,[rel PW_F130_F054]       ; xmm7=data2L
-        pmaddwd   xmm6,[rel PW_F130_F054]       ; xmm6=data2H
-        pmaddwd   xmm4,[rel PW_F054_MF130]      ; xmm4=data6L
-        pmaddwd   xmm0,[rel PW_F054_MF130]      ; xmm0=data6H
-
-        paddd   xmm7,[rel PD_DESCALE_P1]
-        paddd   xmm6,[rel PD_DESCALE_P1]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-        paddd   xmm4,[rel PD_DESCALE_P1]
-        paddd   xmm0,[rel PD_DESCALE_P1]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm7,xmm6             ; xmm7=data2
-        packssdw  xmm4,xmm0             ; xmm4=data6
-
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=data2
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=data6
-
-        ; -- Odd part
-
-        movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
-
-        movdqa  xmm6,xmm2               ; xmm2=tmp4
-        movdqa  xmm0,xmm5               ; xmm5=tmp5
-        paddw   xmm6,xmm3               ; xmm6=z3
-        paddw   xmm0,xmm1               ; xmm0=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm7,xmm6
-        movdqa    xmm4,xmm6
-        punpcklwd xmm7,xmm0
-        punpckhwd xmm4,xmm0
-        movdqa    xmm6,xmm7
-        movdqa    xmm0,xmm4
-        pmaddwd   xmm7,[rel PW_MF078_F117]      ; xmm7=z3L
-        pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3H
-        pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4L
-        pmaddwd   xmm0,[rel PW_F117_F078]       ; xmm0=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm7,xmm2
-        movdqa    xmm4,xmm2
-        punpcklwd xmm7,xmm1
-        punpckhwd xmm4,xmm1
-        movdqa    xmm2,xmm7
-        movdqa    xmm1,xmm4
-        pmaddwd   xmm7,[rel PW_MF060_MF089]     ; xmm7=tmp4L
-        pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4H
-        pmaddwd   xmm2,[rel PW_MF089_F060]      ; xmm2=tmp7L
-        pmaddwd   xmm1,[rel PW_MF089_F060]      ; xmm1=tmp7H
-
-        paddd   xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
-        paddd   xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
-        paddd   xmm2,xmm6               ; xmm2=data1L
-        paddd   xmm1,xmm0               ; xmm1=data1H
-
-        paddd   xmm7,[rel PD_DESCALE_P1]
-        paddd   xmm4,[rel PD_DESCALE_P1]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm2,[rel PD_DESCALE_P1]
-        paddd   xmm1,[rel PD_DESCALE_P1]
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-
-        packssdw  xmm7,xmm4             ; xmm7=data7
-        packssdw  xmm2,xmm1             ; xmm2=data1
-
-        movdqa    xmm4,xmm5
-        movdqa    xmm1,xmm5
-        punpcklwd xmm4,xmm3
-        punpckhwd xmm1,xmm3
-        movdqa    xmm5,xmm4
-        movdqa    xmm3,xmm1
-        pmaddwd   xmm4,[rel PW_MF050_MF256]     ; xmm4=tmp5L
-        pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5H
-        pmaddwd   xmm5,[rel PW_MF256_F050]      ; xmm5=tmp6L
-        pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6H
-
-        paddd   xmm4,xmm6               ; xmm4=data5L
-        paddd   xmm1,xmm0               ; xmm1=data5H
-        paddd   xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
-        paddd   xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
-
-        paddd   xmm4,[rel PD_DESCALE_P1]
-        paddd   xmm1,[rel PD_DESCALE_P1]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-        paddd   xmm5,[rel PD_DESCALE_P1]
-        paddd   xmm3,[rel PD_DESCALE_P1]
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-
-        packssdw  xmm4,xmm1             ; xmm4=data5
-        packssdw  xmm5,xmm3             ; xmm5=data3
-
-        ; ---- Pass 2: process columns.
-
-        movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=col0
-        movdqa  xmm0, XMMWORD [wk(4)]   ; xmm0=col2
-
-        ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-        ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm1,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm2             ; xmm6=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm1,xmm2             ; xmm1=(40 41 50 51 60 61 70 71)
-        movdqa    xmm3,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm3,xmm5             ; xmm3=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm2, XMMWORD [wk(3)]   ; xmm2=col4
-        movdqa  xmm5, XMMWORD [wk(5)]   ; xmm5=col6
-
-        ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-        ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm4             ; xmm2=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm0,xmm4             ; xmm0=(44 45 54 55 64 65 74 75)
-        movdqa    xmm3,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm7             ; xmm5=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm3,xmm7             ; xmm3=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm4,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm4,xmm5             ; xmm4=(24 25 26 27 34 35 36 37)
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm7,xmm3             ; xmm7=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm4,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm5             ; xmm6=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm4,xmm5             ; xmm4=(20 21 22 23 30 31 32 33)
-        movdqa    xmm0,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm3             ; xmm1=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm0,xmm3             ; xmm0=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm5,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm2            ; xmm6=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm5,xmm2            ; xmm5=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm7            ; xmm0=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm3,xmm7            ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm7,xmm6
-        psubw   xmm5,xmm0               ; xmm5=data1-data6=tmp6
-        psubw   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        paddw   xmm2,xmm0               ; xmm2=data1+data6=tmp1
-        paddw   xmm7,xmm3               ; xmm7=data0+data7=tmp0
-
-        movdqa  xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
-        movdqa  xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movdqa     xmm5,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm5,xmm0            ; xmm5=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm3            ; xmm1=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm6,xmm3            ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm3,xmm4
-        paddw   xmm5,xmm1               ; xmm5=data3+data4=tmp3
-        paddw   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        psubw   xmm0,xmm1               ; xmm0=data3-data4=tmp4
-        psubw   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm6,xmm2
-        paddw   xmm7,xmm5               ; xmm7=tmp10
-        paddw   xmm2,xmm4               ; xmm2=tmp11
-        psubw   xmm1,xmm5               ; xmm1=tmp13
-        psubw   xmm6,xmm4               ; xmm6=tmp12
-
-        movdqa  xmm5,xmm7
-        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
-        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
-
-        paddw   xmm7,[rel PW_DESCALE_P2X]
-        paddw   xmm5,[rel PW_DESCALE_P2X]
-        psraw   xmm7,PASS1_BITS         ; xmm7=data0
-        psraw   xmm5,PASS1_BITS         ; xmm5=data4
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
-        movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm4,xmm1             ; xmm1=tmp13
-        movdqa    xmm2,xmm1
-        punpcklwd xmm4,xmm6             ; xmm6=tmp12
-        punpckhwd xmm2,xmm6
-        movdqa    xmm1,xmm4
-        movdqa    xmm6,xmm2
-        pmaddwd   xmm4,[rel PW_F130_F054]       ; xmm4=data2L
-        pmaddwd   xmm2,[rel PW_F130_F054]       ; xmm2=data2H
-        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=data6L
-        pmaddwd   xmm6,[rel PW_F054_MF130]      ; xmm6=data6H
-
-        paddd   xmm4,[rel PD_DESCALE_P2]
-        paddd   xmm2,[rel PD_DESCALE_P2]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm1,[rel PD_DESCALE_P2]
-        paddd   xmm6,[rel PD_DESCALE_P2]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm6,DESCALE_P2
-
-        packssdw  xmm4,xmm2             ; xmm4=data2
-        packssdw  xmm1,xmm6             ; xmm1=data6
-
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
-
-        ; -- Odd part
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        movdqa  xmm2,xmm0               ; xmm0=tmp4
-        movdqa  xmm6,xmm3               ; xmm3=tmp5
-        paddw   xmm2,xmm7               ; xmm2=z3
-        paddw   xmm6,xmm5               ; xmm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm1,xmm2
-        punpcklwd xmm4,xmm6
-        punpckhwd xmm1,xmm6
-        movdqa    xmm2,xmm4
-        movdqa    xmm6,xmm1
-        pmaddwd   xmm4,[rel PW_MF078_F117]      ; xmm4=z3L
-        pmaddwd   xmm1,[rel PW_MF078_F117]      ; xmm1=z3H
-        pmaddwd   xmm2,[rel PW_F117_F078]       ; xmm2=z4L
-        pmaddwd   xmm6,[rel PW_F117_F078]       ; xmm6=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm1,xmm0
-        punpcklwd xmm4,xmm5
-        punpckhwd xmm1,xmm5
-        movdqa    xmm0,xmm4
-        movdqa    xmm5,xmm1
-        pmaddwd   xmm4,[rel PW_MF060_MF089]     ; xmm4=tmp4L
-        pmaddwd   xmm1,[rel PW_MF060_MF089]     ; xmm1=tmp4H
-        pmaddwd   xmm0,[rel PW_MF089_F060]      ; xmm0=tmp7L
-        pmaddwd   xmm5,[rel PW_MF089_F060]      ; xmm5=tmp7H
-
-        paddd   xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
-        paddd   xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
-        paddd   xmm0,xmm2               ; xmm0=data1L
-        paddd   xmm5,xmm6               ; xmm5=data1H
-
-        paddd   xmm4,[rel PD_DESCALE_P2]
-        paddd   xmm1,[rel PD_DESCALE_P2]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm0,[rel PD_DESCALE_P2]
-        paddd   xmm5,[rel PD_DESCALE_P2]
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-
-        packssdw  xmm4,xmm1             ; xmm4=data7
-        packssdw  xmm0,xmm5             ; xmm0=data1
-
-        movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
-
-        movdqa    xmm1,xmm3
-        movdqa    xmm5,xmm3
-        punpcklwd xmm1,xmm7
-        punpckhwd xmm5,xmm7
-        movdqa    xmm3,xmm1
-        movdqa    xmm7,xmm5
-        pmaddwd   xmm1,[rel PW_MF050_MF256]     ; xmm1=tmp5L
-        pmaddwd   xmm5,[rel PW_MF050_MF256]     ; xmm5=tmp5H
-        pmaddwd   xmm3,[rel PW_MF256_F050]      ; xmm3=tmp6L
-        pmaddwd   xmm7,[rel PW_MF256_F050]      ; xmm7=tmp6H
-
-        paddd   xmm1,xmm2               ; xmm1=data5L
-        paddd   xmm5,xmm6               ; xmm5=data5H
-        paddd   xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
-        paddd   xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
-
-        paddd   xmm1,[rel PD_DESCALE_P2]
-        paddd   xmm5,[rel PD_DESCALE_P2]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-        paddd   xmm3,[rel PD_DESCALE_P2]
-        paddd   xmm7,[rel PD_DESCALE_P2]
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm1,xmm5             ; xmm1=data5
-        packssdw  xmm3,xmm7             ; xmm3=data3
-
-        movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jfdctint-sse2.asm b/media/libjpeg/simd/jfdctint-sse2.asm
deleted file mode 100644
index db9d0bbe44..0000000000
--- a/media/libjpeg/simd/jfdctint-sse2.asm
+++ /dev/null
@@ -1,633 +0,0 @@
-;
-; jfdctint.asm - accurate integer FDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; forward DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jfdctint.c; see the jfdctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_fdct_islow_sse2)
-
-EXTN(jconst_fdct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PW_DESCALE_P2X  times 8 dw  1 << (PASS1_BITS-1)
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform the forward DCT on one block of samples.
-;
-; GLOBAL(void)
-; jsimd_fdct_islow_sse2 (DCTELEM *data)
-;
-
-%define data(b)         (b)+8           ; DCTELEM *data
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          6
-
-        align   16
-        global  EXTN(jsimd_fdct_islow_sse2)
-
-EXTN(jsimd_fdct_islow_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-;       push    edi             ; unused
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process rows.
-
-        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
-        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
-
-        ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
-        ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
-        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
-        movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
-
-        movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
-        movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
-        movdqa  XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
-
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
-        movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
-        punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
-        movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
-        punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
-
-        movdqa  xmm6,xmm1
-        movdqa  xmm3,xmm0
-        psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
-        psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
-        paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
-        paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
-
-        movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa  xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
-        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
-
-        movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
-        punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
-        movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
-        punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
-
-        movdqa  xmm2,xmm1
-        movdqa  xmm5,xmm7
-        paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
-        paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
-        psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
-        psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm0,xmm6
-        paddw   xmm3,xmm1               ; xmm3=tmp10
-        paddw   xmm6,xmm7               ; xmm6=tmp11
-        psubw   xmm4,xmm1               ; xmm4=tmp13
-        psubw   xmm0,xmm7               ; xmm0=tmp12
-
-        movdqa  xmm1,xmm3
-        paddw   xmm3,xmm6               ; xmm3=tmp10+tmp11
-        psubw   xmm1,xmm6               ; xmm1=tmp10-tmp11
-
-        psllw   xmm3,PASS1_BITS         ; xmm3=data0
-        psllw   xmm1,PASS1_BITS         ; xmm1=data4
-
-        movdqa  XMMWORD [wk(2)], xmm3   ; wk(2)=data0
-        movdqa  XMMWORD [wk(3)], xmm1   ; wk(3)=data4
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm7,xmm4             ; xmm4=tmp13
-        movdqa    xmm6,xmm4
-        punpcklwd xmm7,xmm0             ; xmm0=tmp12
-        punpckhwd xmm6,xmm0
-        movdqa    xmm4,xmm7
-        movdqa    xmm0,xmm6
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_F130_F054)]       ; xmm7=data2L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]       ; xmm6=data2H
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm4=data6L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm0=data6H
-
-        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm6,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm0,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm7,xmm6             ; xmm7=data2
-        packssdw  xmm4,xmm0             ; xmm4=data6
-
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=data2
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=data6
-
-        ; -- Odd part
-
-        movdqa  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
-
-        movdqa  xmm6,xmm2               ; xmm2=tmp4
-        movdqa  xmm0,xmm5               ; xmm5=tmp5
-        paddw   xmm6,xmm3               ; xmm6=z3
-        paddw   xmm0,xmm1               ; xmm0=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm7,xmm6
-        movdqa    xmm4,xmm6
-        punpcklwd xmm7,xmm0
-        punpckhwd xmm4,xmm0
-        movdqa    xmm6,xmm7
-        movdqa    xmm0,xmm4
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm7=z3L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm4=z3H
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]       ; xmm6=z4L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F117_F078)]       ; xmm0=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm7,xmm2
-        movdqa    xmm4,xmm2
-        punpcklwd xmm7,xmm1
-        punpckhwd xmm4,xmm1
-        movdqa    xmm2,xmm7
-        movdqa    xmm1,xmm4
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm7=tmp4L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm4=tmp4H
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm2=tmp7L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm1=tmp7H
-
-        paddd   xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
-        paddd   xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
-        paddd   xmm2,xmm6               ; xmm2=data1L
-        paddd   xmm1,xmm0               ; xmm1=data1H
-
-        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-
-        packssdw  xmm7,xmm4             ; xmm7=data7
-        packssdw  xmm2,xmm1             ; xmm2=data1
-
-        movdqa    xmm4,xmm5
-        movdqa    xmm1,xmm5
-        punpcklwd xmm4,xmm3
-        punpckhwd xmm1,xmm3
-        movdqa    xmm5,xmm4
-        movdqa    xmm3,xmm1
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm4=tmp5L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm1=tmp5H
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm5=tmp6L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm3=tmp6H
-
-        paddd   xmm4,xmm6               ; xmm4=data5L
-        paddd   xmm1,xmm0               ; xmm1=data5H
-        paddd   xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
-        paddd   xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
-
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm1,DESCALE_P1
-        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P1)]
-        paddd   xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-
-        packssdw  xmm4,xmm1             ; xmm4=data5
-        packssdw  xmm5,xmm3             ; xmm5=data3
-
-        ; ---- Pass 2: process columns.
-
-;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
-
-        movdqa  xmm6, XMMWORD [wk(2)]   ; xmm6=col0
-        movdqa  xmm0, XMMWORD [wk(4)]   ; xmm0=col2
-
-        ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
-        ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
-
-        movdqa    xmm1,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm2             ; xmm6=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm1,xmm2             ; xmm1=(40 41 50 51 60 61 70 71)
-        movdqa    xmm3,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(02 03 12 13 22 23 32 33)
-        punpckhwd xmm3,xmm5             ; xmm3=(42 43 52 53 62 63 72 73)
-
-        movdqa  xmm2, XMMWORD [wk(3)]   ; xmm2=col4
-        movdqa  xmm5, XMMWORD [wk(5)]   ; xmm5=col6
-
-        ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
-        ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
-
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm4             ; xmm2=(04 05 14 15 24 25 34 35)
-        punpckhwd xmm0,xmm4             ; xmm0=(44 45 54 55 64 65 74 75)
-        movdqa    xmm3,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm7             ; xmm5=(06 07 16 17 26 27 36 37)
-        punpckhwd xmm3,xmm7             ; xmm3=(46 47 56 57 66 67 76 77)
-
-        movdqa    xmm4,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm5             ; xmm2=(04 05 06 07 14 15 16 17)
-        punpckhdq xmm4,xmm5             ; xmm4=(24 25 26 27 34 35 36 37)
-        movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(44 45 46 47 54 55 56 57)
-        punpckhdq xmm7,xmm3             ; xmm7=(64 65 66 67 74 75 76 77)
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
-        movdqa  XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
-        movdqa  XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
-
-        movdqa    xmm4,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm5             ; xmm6=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm4,xmm5             ; xmm4=(20 21 22 23 30 31 32 33)
-        movdqa    xmm0,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm3             ; xmm1=(40 41 42 43 50 51 52 53)
-        punpckhdq xmm0,xmm3             ; xmm0=(60 61 62 63 70 71 72 73)
-
-        movdqa     xmm5,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm2            ; xmm6=(00 01 02 03 04 05 06 07)=data0
-        punpckhqdq xmm5,xmm2            ; xmm5=(10 11 12 13 14 15 16 17)=data1
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm7            ; xmm0=(60 61 62 63 64 65 66 67)=data6
-        punpckhqdq xmm3,xmm7            ; xmm3=(70 71 72 73 74 75 76 77)=data7
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm7,xmm6
-        psubw   xmm5,xmm0               ; xmm5=data1-data6=tmp6
-        psubw   xmm6,xmm3               ; xmm6=data0-data7=tmp7
-        paddw   xmm2,xmm0               ; xmm2=data1+data6=tmp1
-        paddw   xmm7,xmm3               ; xmm7=data0+data7=tmp0
-
-        movdqa  xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
-        movdqa  xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
-
-        movdqa     xmm5,xmm4            ; transpose coefficients(phase 3)
-        punpcklqdq xmm4,xmm0            ; xmm4=(20 21 22 23 24 25 26 27)=data2
-        punpckhqdq xmm5,xmm0            ; xmm5=(30 31 32 33 34 35 36 37)=data3
-        movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm3            ; xmm1=(40 41 42 43 44 45 46 47)=data4
-        punpckhqdq xmm6,xmm3            ; xmm6=(50 51 52 53 54 55 56 57)=data5
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm3,xmm4
-        paddw   xmm5,xmm1               ; xmm5=data3+data4=tmp3
-        paddw   xmm4,xmm6               ; xmm4=data2+data5=tmp2
-        psubw   xmm0,xmm1               ; xmm0=data3-data4=tmp4
-        psubw   xmm3,xmm6               ; xmm3=data2-data5=tmp5
-
-        ; -- Even part
-
-        movdqa  xmm1,xmm7
-        movdqa  xmm6,xmm2
-        paddw   xmm7,xmm5               ; xmm7=tmp10
-        paddw   xmm2,xmm4               ; xmm2=tmp11
-        psubw   xmm1,xmm5               ; xmm1=tmp13
-        psubw   xmm6,xmm4               ; xmm6=tmp12
-
-        movdqa  xmm5,xmm7
-        paddw   xmm7,xmm2               ; xmm7=tmp10+tmp11
-        psubw   xmm5,xmm2               ; xmm5=tmp10-tmp11
-
-        paddw   xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        paddw   xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
-        psraw   xmm7,PASS1_BITS         ; xmm7=data0
-        psraw   xmm5,PASS1_BITS         ; xmm5=data4
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7
-        movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5
-
-        ; (Original)
-        ; z1 = (tmp12 + tmp13) * 0.541196100;
-        ; data2 = z1 + tmp13 * 0.765366865;
-        ; data6 = z1 + tmp12 * -1.847759065;
-        ;
-        ; (This implementation)
-        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
-        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
-
-        movdqa    xmm4,xmm1             ; xmm1=tmp13
-        movdqa    xmm2,xmm1
-        punpcklwd xmm4,xmm6             ; xmm6=tmp12
-        punpckhwd xmm2,xmm6
-        movdqa    xmm1,xmm4
-        movdqa    xmm6,xmm2
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]       ; xmm4=data2L
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F130_F054)]       ; xmm2=data2H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=data6L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm6=data6H
-
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm6,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm6,DESCALE_P2
-
-        packssdw  xmm4,xmm2             ; xmm4=data2
-        packssdw  xmm1,xmm6             ; xmm1=data6
-
-        movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1
-
-        ; -- Odd part
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
-
-        movdqa  xmm2,xmm0               ; xmm0=tmp4
-        movdqa  xmm6,xmm3               ; xmm3=tmp5
-        paddw   xmm2,xmm7               ; xmm2=z3
-        paddw   xmm6,xmm5               ; xmm6=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm4,xmm2
-        movdqa    xmm1,xmm2
-        punpcklwd xmm4,xmm6
-        punpckhwd xmm1,xmm6
-        movdqa    xmm2,xmm4
-        movdqa    xmm6,xmm1
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm4=z3L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm1=z3H
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F117_F078)]       ; xmm2=z4L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F117_F078)]       ; xmm6=z4H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
-        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
-
-        ; (Original)
-        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
-        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
-        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
-        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
-        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
-        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
-        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
-        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
-        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm1,xmm0
-        punpcklwd xmm4,xmm5
-        punpckhwd xmm1,xmm5
-        movdqa    xmm0,xmm4
-        movdqa    xmm5,xmm1
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm4=tmp4L
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm1=tmp4H
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm0=tmp7L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm5=tmp7H
-
-        paddd   xmm4, XMMWORD [wk(0)]   ; xmm4=data7L
-        paddd   xmm1, XMMWORD [wk(1)]   ; xmm1=data7H
-        paddd   xmm0,xmm2               ; xmm0=data1L
-        paddd   xmm5,xmm6               ; xmm5=data1H
-
-        paddd   xmm4,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm0,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-
-        packssdw  xmm4,xmm1             ; xmm4=data7
-        packssdw  xmm0,xmm5             ; xmm0=data1
-
-        movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4
-        movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0
-
-        movdqa    xmm1,xmm3
-        movdqa    xmm5,xmm3
-        punpcklwd xmm1,xmm7
-        punpckhwd xmm5,xmm7
-        movdqa    xmm3,xmm1
-        movdqa    xmm7,xmm5
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm1=tmp5L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm5=tmp5H
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm3=tmp6L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm7=tmp6H
-
-        paddd   xmm1,xmm2               ; xmm1=data5L
-        paddd   xmm5,xmm6               ; xmm5=data5H
-        paddd   xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
-        paddd   xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
-
-        paddd   xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm1,DESCALE_P2
-        psrad   xmm5,DESCALE_P2
-        paddd   xmm3,[GOTOFF(ebx,PD_DESCALE_P2)]
-        paddd   xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm1,xmm5             ; xmm1=data5
-        packssdw  xmm3,xmm7             ; xmm3=data3
-
-        movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3
-
-;       pop     edi             ; unused
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctflt-3dn.asm b/media/libjpeg/simd/jidctflt-3dn.asm
deleted file mode 100644
index 99356f20a4..0000000000
--- a/media/libjpeg/simd/jidctflt-3dn.asm
+++ /dev/null
@@ -1,451 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_3dnow)
-
-EXTN(jconst_idct_float_3dnow):
-
-PD_1_414        times 2 dd  1.414213562373095048801689
-PD_1_847        times 2 dd  1.847759065022573512256366
-PD_1_082        times 2 dd  1.082392200292393968799446
-PD_2_613        times 2 dd  2.613125929752753055713286
-PD_RNDINT_MAGIC times 2 dd  100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_3dnow (void *dct_table, JCOEFPTR coef_block,
-;                         JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_3dnow)
-
-EXTN(jsimd_idct_float_3dnow):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     ecx, DCTSIZE/2                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        pushpic ebx             ; save GOT address
-        mov     ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        mov     eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        or      ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        or      ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        or      eax,ebx
-        poppic  ebx             ; restore GOT address
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd mm0,mm0
-        psrad     mm0,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm0,mm0
-
-        pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0
-        punpckhdq mm1,mm1
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movd      mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movd      mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movd      mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movd      mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd mm0,mm0
-        punpcklwd mm1,mm1
-        psrad     mm0,(DWORD_BIT-WORD_BIT)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm0,mm0
-        pi2fd     mm1,mm1
-
-        pfmul     mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        punpcklwd mm2,mm2
-        punpcklwd mm3,mm3
-        psrad     mm2,(DWORD_BIT-WORD_BIT)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm2,mm2
-        pi2fd     mm3,mm3
-
-        pfmul     mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pfsub   mm0,mm2                 ; mm0=tmp11
-        pfsub   mm1,mm3
-        pfadd   mm4,mm2                 ; mm4=tmp10
-        pfadd   mm5,mm3                 ; mm5=tmp13
-
-        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]
-        pfsub   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm5                 ; mm4=tmp3
-        pfsub   mm0,mm1                 ; mm0=tmp2
-        pfadd   mm6,mm5                 ; mm6=tmp0
-        pfadd   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; tmp3
-        movq    MMWORD [wk(0)], mm0     ; tmp2
-
-        ; -- Odd part
-
-        movd      mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movd      mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movd      mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movd      mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd mm2,mm2
-        punpcklwd mm3,mm3
-        psrad     mm2,(DWORD_BIT-WORD_BIT)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm2,mm2
-        pi2fd     mm3,mm3
-
-        pfmul     mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        punpcklwd mm5,mm5
-        punpcklwd mm1,mm1
-        psrad     mm5,(DWORD_BIT-WORD_BIT)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)
-        pi2fd     mm5,mm5
-        pi2fd     mm1,mm1
-
-        pfmul     mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        pfmul     mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        pfadd   mm2,mm1                 ; mm2=z11
-        pfadd   mm5,mm3                 ; mm5=z13
-        pfsub   mm4,mm1                 ; mm4=z12
-        pfsub   mm0,mm3                 ; mm0=z10
-
-        movq    mm1,mm2
-        pfsub   mm2,mm5
-        pfadd   mm1,mm5                 ; mm1=tmp7
-
-        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11
-
-        movq    mm3,mm0
-        pfadd   mm0,mm4
-        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
-        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
-        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
-        pfsubr  mm3,mm0                 ; mm3=tmp12
-        pfsub   mm4,mm0                 ; mm4=tmp10
-
-        ; -- Final output stage
-
-        pfsub   mm3,mm1                 ; mm3=tmp6
-        movq    mm5,mm6
-        movq    mm0,mm7
-        pfadd   mm6,mm1                 ; mm6=data0=(00 01)
-        pfadd   mm7,mm3                 ; mm7=data1=(10 11)
-        pfsub   mm5,mm1                 ; mm5=data7=(70 71)
-        pfsub   mm0,mm3                 ; mm0=data6=(60 61)
-        pfsub   mm2,mm3                 ; mm2=tmp5
-
-        movq      mm1,mm6               ; transpose coefficients
-        punpckldq mm6,mm7               ; mm6=(00 10)
-        punpckhdq mm1,mm7               ; mm1=(01 11)
-        movq      mm3,mm0               ; transpose coefficients
-        punpckldq mm0,mm5               ; mm0=(60 70)
-        punpckhdq mm3,mm5               ; mm3=(61 71)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
-        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
-        movq    mm5, MMWORD [wk(1)]     ; mm5=tmp3
-
-        pfadd   mm4,mm2                 ; mm4=tmp4
-        movq    mm6,mm7
-        movq    mm1,mm5
-        pfadd   mm7,mm2                 ; mm7=data2=(20 21)
-        pfadd   mm5,mm4                 ; mm5=data4=(40 41)
-        pfsub   mm6,mm2                 ; mm6=data5=(50 51)
-        pfsub   mm1,mm4                 ; mm1=data3=(30 31)
-
-        movq      mm0,mm7               ; transpose coefficients
-        punpckldq mm7,mm1               ; mm7=(20 30)
-        punpckhdq mm0,mm1               ; mm0=(21 31)
-        movq      mm3,mm5               ; transpose coefficients
-        punpckldq mm5,mm6               ; mm5=(40 50)
-        punpckhdq mm3,mm6               ; mm3=(41 51)
-
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
-        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3
-
-.nextcolumn:
-        add     esi, byte 2*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 2*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/2                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pfsub   mm0,mm2                 ; mm0=tmp11
-        pfsub   mm1,mm3
-        pfadd   mm4,mm2                 ; mm4=tmp10
-        pfadd   mm5,mm3                 ; mm5=tmp13
-
-        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]
-        pfsub   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        pfsub   mm4,mm5                 ; mm4=tmp3
-        pfsub   mm0,mm1                 ; mm0=tmp2
-        pfadd   mm6,mm5                 ; mm6=tmp0
-        pfadd   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; tmp3
-        movq    MMWORD [wk(0)], mm0     ; tmp2
-
-        ; -- Odd part
-
-        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        pfadd   mm2,mm1                 ; mm2=z11
-        pfadd   mm5,mm3                 ; mm5=z13
-        pfsub   mm4,mm1                 ; mm4=z12
-        pfsub   mm0,mm3                 ; mm0=z10
-
-        movq    mm1,mm2
-        pfsub   mm2,mm5
-        pfadd   mm1,mm5                 ; mm1=tmp7
-
-        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11
-
-        movq    mm3,mm0
-        pfadd   mm0,mm4
-        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
-        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
-        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
-        pfsubr  mm3,mm0                 ; mm3=tmp12
-        pfsub   mm4,mm0                 ; mm4=tmp10
-
-        ; -- Final output stage
-
-        pfsub   mm3,mm1                 ; mm3=tmp6
-        movq    mm5,mm6
-        movq    mm0,mm7
-        pfadd   mm6,mm1                 ; mm6=data0=(00 10)
-        pfadd   mm7,mm3                 ; mm7=data1=(01 11)
-        pfsub   mm5,mm1                 ; mm5=data7=(07 17)
-        pfsub   mm0,mm3                 ; mm0=data6=(06 16)
-        pfsub   mm2,mm3                 ; mm2=tmp5
-
-        movq    mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm1=[PD_RNDINT_MAGIC]
-        pcmpeqd mm3,mm3
-        psrld   mm3,WORD_BIT            ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}
-
-        pfadd   mm6,mm1                 ; mm6=roundint(data0/8)=(00 ** 10 **)
-        pfadd   mm7,mm1                 ; mm7=roundint(data1/8)=(01 ** 11 **)
-        pfadd   mm0,mm1                 ; mm0=roundint(data6/8)=(06 ** 16 **)
-        pfadd   mm5,mm1                 ; mm5=roundint(data7/8)=(07 ** 17 **)
-
-        pand    mm6,mm3                 ; mm6=(00 -- 10 --)
-        pslld   mm7,WORD_BIT            ; mm7=(-- 01 -- 11)
-        pand    mm0,mm3                 ; mm0=(06 -- 16 --)
-        pslld   mm5,WORD_BIT            ; mm5=(-- 07 -- 17)
-        por     mm6,mm7                 ; mm6=(00 01 10 11)
-        por     mm0,mm5                 ; mm0=(06 07 16 17)
-
-        movq    mm1, MMWORD [wk(0)]     ; mm1=tmp2
-        movq    mm3, MMWORD [wk(1)]     ; mm3=tmp3
-
-        pfadd   mm4,mm2                 ; mm4=tmp4
-        movq    mm7,mm1
-        movq    mm5,mm3
-        pfadd   mm1,mm2                 ; mm1=data2=(02 12)
-        pfadd   mm3,mm4                 ; mm3=data4=(04 14)
-        pfsub   mm7,mm2                 ; mm7=data5=(05 15)
-        pfsub   mm5,mm4                 ; mm5=data3=(03 13)
-
-        movq    mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm2=[PD_RNDINT_MAGIC]
-        pcmpeqd mm4,mm4
-        psrld   mm4,WORD_BIT            ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}
-
-        pfadd   mm3,mm2                 ; mm3=roundint(data4/8)=(04 ** 14 **)
-        pfadd   mm7,mm2                 ; mm7=roundint(data5/8)=(05 ** 15 **)
-        pfadd   mm1,mm2                 ; mm1=roundint(data2/8)=(02 ** 12 **)
-        pfadd   mm5,mm2                 ; mm5=roundint(data3/8)=(03 ** 13 **)
-
-        pand    mm3,mm4                 ; mm3=(04 -- 14 --)
-        pslld   mm7,WORD_BIT            ; mm7=(-- 05 -- 15)
-        pand    mm1,mm4                 ; mm1=(02 -- 12 --)
-        pslld   mm5,WORD_BIT            ; mm5=(-- 03 -- 13)
-        por     mm3,mm7                 ; mm3=(04 05 14 15)
-        por     mm1,mm5                 ; mm1=(02 03 12 13)
-
-        movq      mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm2=[PB_CENTERJSAMP]
-
-        packsswb  mm6,mm3               ; mm6=(00 01 10 11 04 05 14 15)
-        packsswb  mm1,mm0               ; mm1=(02 03 12 13 06 07 16 17)
-        paddb     mm6,mm2
-        paddb     mm1,mm2
-
-        movq      mm4,mm6               ; transpose coefficients(phase 2)
-        punpcklwd mm6,mm1               ; mm6=(00 01 02 03 10 11 12 13)
-        punpckhwd mm4,mm1               ; mm4=(04 05 06 07 14 15 16 17)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 3)
-        punpckldq mm6,mm4               ; mm6=(00 01 02 03 04 05 06 07)
-        punpckhdq mm7,mm4               ; mm7=(10 11 12 13 14 15 16 17)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 2*SIZEOF_FAST_FLOAT   ; wsptr
-        add     edi, byte 2*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        femms           ; empty MMX/3DNow! state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctflt-sse.asm b/media/libjpeg/simd/jidctflt-sse.asm
deleted file mode 100644
index 4d4af2fffc..0000000000
--- a/media/libjpeg/simd/jidctflt-sse.asm
+++ /dev/null
@@ -1,571 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_sse)
-
-EXTN(jconst_idct_float_sse):
-
-PD_1_414        times 4 dd  1.414213562373095048801689
-PD_1_847        times 4 dd  1.847759065022573512256366
-PD_1_082        times 4 dd  1.082392200292393968799446
-PD_M2_613       times 4 dd -2.613125929752753055713286
-PD_0_125        times 4 dd  0.125       ; 1/8
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_sse)
-
-EXTN(jsimd_idct_float_sse):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm1,mm0
-        packsswb mm1,mm1
-        movd    eax,mm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-        punpckhwd mm1,mm0                       ; mm1=(** 02 ** 03)
-        punpcklwd mm0,mm0                       ; mm0=(00 00 01 01)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in0H=(02 03)
-        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in0L=(00 01)
-        cvtpi2ps  xmm3,mm1                      ; xmm3=(02 03 ** **)
-        cvtpi2ps  xmm0,mm0                      ; xmm0=(00 01 ** **)
-        movlhps   xmm0,xmm3                     ; xmm0=in0=(00 01 02 03)
-
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm1,xmm0
-        movaps  xmm2,xmm0
-        movaps  xmm3,xmm0
-
-        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
-        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
-        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
-        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        punpckhwd mm4,mm0                       ; mm4=(** 02 ** 03)
-        punpcklwd mm0,mm0                       ; mm0=(00 00 01 01)
-        punpckhwd mm5,mm1                       ; mm5=(** 22 ** 23)
-        punpcklwd mm1,mm1                       ; mm1=(20 20 21 21)
-
-        psrad     mm4,(DWORD_BIT-WORD_BIT)      ; mm4=in0H=(02 03)
-        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in0L=(00 01)
-        cvtpi2ps  xmm4,mm4                      ; xmm4=(02 03 ** **)
-        cvtpi2ps  xmm0,mm0                      ; xmm0=(00 01 ** **)
-        psrad     mm5,(DWORD_BIT-WORD_BIT)      ; mm5=in2H=(22 23)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in2L=(20 21)
-        cvtpi2ps  xmm5,mm5                      ; xmm5=(22 23 ** **)
-        cvtpi2ps  xmm1,mm1                      ; xmm1=(20 21 ** **)
-
-        punpckhwd mm6,mm2                       ; mm6=(** 42 ** 43)
-        punpcklwd mm2,mm2                       ; mm2=(40 40 41 41)
-        punpckhwd mm7,mm3                       ; mm7=(** 62 ** 63)
-        punpcklwd mm3,mm3                       ; mm3=(60 60 61 61)
-
-        psrad     mm6,(DWORD_BIT-WORD_BIT)      ; mm6=in4H=(42 43)
-        psrad     mm2,(DWORD_BIT-WORD_BIT)      ; mm2=in4L=(40 41)
-        cvtpi2ps  xmm6,mm6                      ; xmm6=(42 43 ** **)
-        cvtpi2ps  xmm2,mm2                      ; xmm2=(40 41 ** **)
-        psrad     mm7,(DWORD_BIT-WORD_BIT)      ; mm7=in6H=(62 63)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)      ; mm3=in6L=(60 61)
-        cvtpi2ps  xmm7,mm7                      ; xmm7=(62 63 ** **)
-        cvtpi2ps  xmm3,mm3                      ; xmm3=(60 61 ** **)
-
-        movlhps   xmm0,xmm4                     ; xmm0=in0=(00 01 02 03)
-        movlhps   xmm1,xmm5                     ; xmm1=in2=(20 21 22 23)
-        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movlhps   xmm2,xmm6                     ; xmm2=in4=(40 41 42 43)
-        movlhps   xmm3,xmm7                     ; xmm3=in6=(60 61 62 63)
-        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        punpckhwd mm6,mm4                       ; mm6=(** 12 ** 13)
-        punpcklwd mm4,mm4                       ; mm4=(10 10 11 11)
-        punpckhwd mm2,mm0                       ; mm2=(** 32 ** 33)
-        punpcklwd mm0,mm0                       ; mm0=(30 30 31 31)
-
-        psrad     mm6,(DWORD_BIT-WORD_BIT)      ; mm6=in1H=(12 13)
-        psrad     mm4,(DWORD_BIT-WORD_BIT)      ; mm4=in1L=(10 11)
-        cvtpi2ps  xmm4,mm6                      ; xmm4=(12 13 ** **)
-        cvtpi2ps  xmm2,mm4                      ; xmm2=(10 11 ** **)
-        psrad     mm2,(DWORD_BIT-WORD_BIT)      ; mm2=in3H=(32 33)
-        psrad     mm0,(DWORD_BIT-WORD_BIT)      ; mm0=in3L=(30 31)
-        cvtpi2ps  xmm0,mm2                      ; xmm0=(32 33 ** **)
-        cvtpi2ps  xmm3,mm0                      ; xmm3=(30 31 ** **)
-
-        punpckhwd mm7,mm5                       ; mm7=(** 52 ** 53)
-        punpcklwd mm5,mm5                       ; mm5=(50 50 51 51)
-        punpckhwd mm3,mm1                       ; mm3=(** 72 ** 73)
-        punpcklwd mm1,mm1                       ; mm1=(70 70 71 71)
-
-        movlhps   xmm2,xmm4                     ; xmm2=in1=(10 11 12 13)
-        movlhps   xmm3,xmm0                     ; xmm3=in3=(30 31 32 33)
-
-        psrad     mm7,(DWORD_BIT-WORD_BIT)      ; mm7=in5H=(52 53)
-        psrad     mm5,(DWORD_BIT-WORD_BIT)      ; mm5=in5L=(50 51)
-        cvtpi2ps  xmm4,mm7                      ; xmm4=(52 53 ** **)
-        cvtpi2ps  xmm5,mm5                      ; xmm5=(50 51 ** **)
-        psrad     mm3,(DWORD_BIT-WORD_BIT)      ; mm3=in7H=(72 73)
-        psrad     mm1,(DWORD_BIT-WORD_BIT)      ; mm1=in7L=(70 71)
-        cvtpi2ps  xmm0,mm3                      ; xmm0=(72 73 ** **)
-        cvtpi2ps  xmm1,mm1                      ; xmm1=(70 71 ** **)
-
-        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movlhps   xmm5,xmm4                     ; xmm5=in5=(50 51 52 53)
-        movlhps   xmm1,xmm0                     ; xmm1=in7=(70 71 72 73)
-        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
-        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
-        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
-        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
-        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
-        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
-        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
-        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
-        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
-
-        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
-        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm0,xmm7
-        movaps  xmm3,xmm5
-        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
-        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
-        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
-        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
-
-        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
-        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
-        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
-        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
-        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
-        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
-
-        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
-        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
-        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
-        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
-        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
-        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
-        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
-        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
-        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
-        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
-        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
-        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps  xmm1,[GOTOFF(ebx,PD_0_125)]     ; xmm1=[PD_0_125]
-
-        mulps   xmm6,xmm1               ; descale(1/8)
-        mulps   xmm7,xmm1               ; descale(1/8)
-        mulps   xmm5,xmm1               ; descale(1/8)
-        mulps   xmm0,xmm1               ; descale(1/8)
-
-        movhlps   xmm3,xmm6
-        movhlps   xmm1,xmm7
-        cvtps2pi  mm0,xmm6              ; round to int32, mm0=data0L=(00 10)
-        cvtps2pi  mm1,xmm7              ; round to int32, mm1=data1L=(01 11)
-        cvtps2pi  mm2,xmm3              ; round to int32, mm2=data0H=(20 30)
-        cvtps2pi  mm3,xmm1              ; round to int32, mm3=data1H=(21 31)
-        packssdw  mm0,mm2               ; mm0=data0=(00 10 20 30)
-        packssdw  mm1,mm3               ; mm1=data1=(01 11 21 31)
-
-        movhlps   xmm6,xmm5
-        movhlps   xmm7,xmm0
-        cvtps2pi  mm4,xmm5              ; round to int32, mm4=data7L=(07 17)
-        cvtps2pi  mm5,xmm0              ; round to int32, mm5=data6L=(06 16)
-        cvtps2pi  mm6,xmm6              ; round to int32, mm6=data7H=(27 37)
-        cvtps2pi  mm7,xmm7              ; round to int32, mm7=data6H=(26 36)
-        packssdw  mm4,mm6               ; mm4=data7=(07 17 27 37)
-        packssdw  mm5,mm7               ; mm5=data6=(06 16 26 36)
-
-        packsswb  mm0,mm5               ; mm0=(00 10 20 30 06 16 26 36)
-        packsswb  mm1,mm4               ; mm1=(01 11 21 31 07 17 27 37)
-
-        movaps  xmm3, XMMWORD [wk(0)]   ; xmm3=tmp2
-        movaps  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
-
-        movaps  xmm6,[GOTOFF(ebx,PD_0_125)]     ; xmm6=[PD_0_125]
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm5,xmm3
-        movaps  xmm0,xmm1
-        addps   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
-        addps   xmm1,xmm4               ; xmm1=data4=(04 14 24 34)
-        subps   xmm5,xmm2               ; xmm5=data5=(05 15 25 35)
-        subps   xmm0,xmm4               ; xmm0=data3=(03 13 23 33)
-
-        mulps   xmm3,xmm6               ; descale(1/8)
-        mulps   xmm1,xmm6               ; descale(1/8)
-        mulps   xmm5,xmm6               ; descale(1/8)
-        mulps   xmm0,xmm6               ; descale(1/8)
-
-        movhlps   xmm7,xmm3
-        movhlps   xmm2,xmm1
-        cvtps2pi  mm2,xmm3              ; round to int32, mm2=data2L=(02 12)
-        cvtps2pi  mm3,xmm1              ; round to int32, mm3=data4L=(04 14)
-        cvtps2pi  mm6,xmm7              ; round to int32, mm6=data2H=(22 32)
-        cvtps2pi  mm7,xmm2              ; round to int32, mm7=data4H=(24 34)
-        packssdw  mm2,mm6               ; mm2=data2=(02 12 22 32)
-        packssdw  mm3,mm7               ; mm3=data4=(04 14 24 34)
-
-        movhlps   xmm4,xmm5
-        movhlps   xmm6,xmm0
-        cvtps2pi  mm5,xmm5              ; round to int32, mm5=data5L=(05 15)
-        cvtps2pi  mm4,xmm0              ; round to int32, mm4=data3L=(03 13)
-        cvtps2pi  mm6,xmm4              ; round to int32, mm6=data5H=(25 35)
-        cvtps2pi  mm7,xmm6              ; round to int32, mm7=data3H=(23 33)
-        packssdw  mm5,mm6               ; mm5=data5=(05 15 25 35)
-        packssdw  mm4,mm7               ; mm4=data3=(03 13 23 33)
-
-        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]
-
-        packsswb  mm2,mm3               ; mm2=(02 12 22 32 04 14 24 34)
-        packsswb  mm4,mm5               ; mm4=(03 13 23 33 05 15 25 35)
-
-        paddb     mm0,mm6
-        paddb     mm1,mm6
-        paddb     mm2,mm6
-        paddb     mm4,mm6
-
-        movq      mm7,mm0               ; transpose coefficients(phase 1)
-        punpcklbw mm0,mm1               ; mm0=(00 01 10 11 20 21 30 31)
-        punpckhbw mm7,mm1               ; mm7=(06 07 16 17 26 27 36 37)
-        movq      mm3,mm2               ; transpose coefficients(phase 1)
-        punpcklbw mm2,mm4               ; mm2=(02 03 12 13 22 23 32 33)
-        punpckhbw mm3,mm4               ; mm3=(04 05 14 15 24 25 34 35)
-
-        movq      mm5,mm0               ; transpose coefficients(phase 2)
-        punpcklwd mm0,mm2               ; mm0=(00 01 02 03 10 11 12 13)
-        punpckhwd mm5,mm2               ; mm5=(20 21 22 23 30 31 32 33)
-        movq      mm6,mm3               ; transpose coefficients(phase 2)
-        punpcklwd mm3,mm7               ; mm3=(04 05 06 07 14 15 16 17)
-        punpckhwd mm6,mm7               ; mm6=(24 25 26 27 34 35 36 37)
-
-        movq      mm1,mm0               ; transpose coefficients(phase 3)
-        punpckldq mm0,mm3               ; mm0=(00 01 02 03 04 05 06 07)
-        punpckhdq mm1,mm3               ; mm1=(10 11 12 13 14 15 16 17)
-        movq      mm4,mm5               ; transpose coefficients(phase 3)
-        punpckldq mm5,mm6               ; mm5=(20 21 22 23 24 25 26 27)
-        punpckhdq mm4,mm6               ; mm4=(30 31 32 33 34 35 36 37)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctflt-sse2-64.asm b/media/libjpeg/simd/jidctflt-sse2-64.asm
deleted file mode 100644
index bdda05d97c..0000000000
--- a/media/libjpeg/simd/jidctflt-sse2-64.asm
+++ /dev/null
@@ -1,482 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414        times 4 dd  1.414213562373095048801689
-PD_1_847        times 4 dd  1.847759065022573512256366
-PD_1_082        times 4 dd  1.082392200292393968799446
-PD_M2_613       times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [workspace]
-        collect_args
-        push    rbx
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-        lea     rdi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     rcx, DCTSIZE/4                          ; ctr
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1,xmm2
-        por     xmm3,xmm4
-        por     xmm5,xmm6
-        por     xmm1,xmm3
-        por     xmm5,xmm7
-        por     xmm1,xmm5
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    rax,rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm1,xmm0
-        movaps  xmm2,xmm0
-        movaps  xmm3,xmm0
-
-        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
-        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
-        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
-        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        jmp     near .nextcolumn
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
-
-        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
-        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
-
-        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[rel PD_1_414]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
-        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
-
-        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
-        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
-        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
-        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
-
-        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
-        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
-        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
-        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
-        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
-        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
-        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
-        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
-        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
-        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
-
-        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
-        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm0,xmm7
-        movaps  xmm3,xmm5
-        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
-        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
-        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
-        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
-
-        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
-        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
-        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
-        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
-        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
-        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
-
-        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
-        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
-        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
-        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
-        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
-
-        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
-        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
-        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
-        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
-        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-        add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     rcx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     rax, [original_rbp]
-        lea     rsi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-        mov     rcx, DCTSIZE/4                          ; ctr
-.rowloop:
-
-        ; -- Even part
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[rel PD_1_414]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
-        mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
-        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
-        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
-        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
-        pcmpeqd xmm3,xmm3
-        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
-        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
-        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
-        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
-
-        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm7,xmm1
-        movaps  xmm5,xmm3
-        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
-        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
-        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
-        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
-
-        movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
-        pcmpeqd xmm4,xmm4
-        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
-        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
-        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
-        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
-
-        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-        paddb     xmm6,xmm2
-        paddb     xmm1,xmm2
-
-        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
-        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
-        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
-
-        add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
-        add     rdi, byte 4*SIZEOF_JSAMPROW
-        dec     rcx                             ; ctr
-        jnz     near .rowloop
-
-        pop     rbx
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctflt-sse2.asm b/media/libjpeg/simd/jidctflt-sse2.asm
deleted file mode 100644
index a15a9c1111..0000000000
--- a/media/libjpeg/simd/jidctflt-sse2.asm
+++ /dev/null
@@ -1,497 +0,0 @@
-;
-; jidctflt.asm - floating-point IDCT (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a floating-point implementation of the inverse DCT
-; (Discrete Cosine Transform). The following code is based directly on
-; the IJG's original jidctflt.c; see the jidctflt.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
-        shufps  %1,%2,0x44
-%endmacro
-
-%macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
-        shufps  %1,%2,0xEE
-%endmacro
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_float_sse2)
-
-EXTN(jconst_idct_float_sse2):
-
-PD_1_414        times 4 dd  1.414213562373095048801689
-PD_1_847        times 4 dd  1.847759065022573512256366
-PD_1_082        times 4 dd  1.082392200292393968799446
-PD_M2_613       times 4 dd -2.613125929752753055713286
-PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
-                                        ; FAST_FLOAT workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_float_sse2)
-
-EXTN(jsimd_idct_float_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm1,xmm2
-        por     xmm3,xmm4
-        por     xmm5,xmm6
-        por     xmm1,xmm3
-        por     xmm5,xmm7
-        por     xmm1,xmm5
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm1,xmm0
-        movaps  xmm2,xmm0
-        movaps  xmm3,xmm0
-
-        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
-        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
-        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
-        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
-        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
-
-        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
-        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
-
-        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
-        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
-        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
-        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
-
-        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
-        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
-        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
-        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
-        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
-
-        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
-        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
-        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
-        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
-        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
-        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
-        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
-        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
-        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
-
-        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
-        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm0,xmm7
-        movaps  xmm3,xmm5
-        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
-        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
-        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
-        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
-
-        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
-        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
-        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
-        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
-        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
-        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
-
-        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
-        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
-        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
-        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
-        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
-        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
-
-        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
-        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
-        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-
-        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
-        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
-        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
-        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
-        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
-        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
-
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
-        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm0
-        movaps  xmm5,xmm1
-        subps   xmm0,xmm2               ; xmm0=tmp11
-        subps   xmm1,xmm3
-        addps   xmm4,xmm2               ; xmm4=tmp10
-        addps   xmm5,xmm3               ; xmm5=tmp13
-
-        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
-        subps   xmm1,xmm5               ; xmm1=tmp12
-
-        movaps  xmm6,xmm4
-        movaps  xmm7,xmm0
-        subps   xmm4,xmm5               ; xmm4=tmp3
-        subps   xmm0,xmm1               ; xmm0=tmp2
-        addps   xmm6,xmm5               ; xmm6=tmp0
-        addps   xmm7,xmm1               ; xmm7=tmp1
-
-        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
-        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
-
-        ; -- Odd part
-
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
-
-        movaps  xmm4,xmm2
-        movaps  xmm0,xmm5
-        addps   xmm2,xmm1               ; xmm2=z11
-        addps   xmm5,xmm3               ; xmm5=z13
-        subps   xmm4,xmm1               ; xmm4=z12
-        subps   xmm0,xmm3               ; xmm0=z10
-
-        movaps  xmm1,xmm2
-        subps   xmm2,xmm5
-        addps   xmm1,xmm5               ; xmm1=tmp7
-
-        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
-
-        movaps  xmm3,xmm0
-        addps   xmm0,xmm4
-        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
-        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
-        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
-        addps   xmm3,xmm0               ; xmm3=tmp12
-        subps   xmm4,xmm0               ; xmm4=tmp10
-
-        ; -- Final output stage
-
-        subps   xmm3,xmm1               ; xmm3=tmp6
-        movaps  xmm5,xmm6
-        movaps  xmm0,xmm7
-        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
-        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
-        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
-        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
-        subps   xmm2,xmm3               ; xmm2=tmp5
-
-        movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
-        pcmpeqd xmm3,xmm3
-        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
-        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
-        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
-        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
-
-        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
-        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
-        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
-        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
-
-        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
-        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
-
-        addps   xmm4,xmm2               ; xmm4=tmp4
-        movaps  xmm7,xmm1
-        movaps  xmm5,xmm3
-        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
-        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
-        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
-        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
-
-        movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
-        pcmpeqd xmm4,xmm4
-        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
-
-        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
-        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
-        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
-        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
-
-        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
-        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
-        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
-        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
-        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
-        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
-
-        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
-        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
-        paddb     xmm6,xmm2
-        paddb     xmm1,xmm2
-
-        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-
-        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
-        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-
-        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctfst-mmx.asm b/media/libjpeg/simd/jidctfst-mmx.asm
deleted file mode 100644
index 6e95bfbcaf..0000000000
--- a/media/libjpeg/simd/jidctfst-mmx.asm
+++ /dev/null
@@ -1,499 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-%define PASS1_BITS      2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ     277             ; FIX(1.082392200)
-F_1_414 equ     362             ; FIX(1.414213562)
-F_1_847 equ     473             ; FIX(1.847759065)
-F_2_613 equ     669             ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
-F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_idct_ifast_mmx)
-
-EXTN(jconst_idct_ifast_mmx):
-
-PW_F1414        times 4 dw  F_1_414 << CONST_SHIFT
-PW_F1847        times 4 dw  F_1_847 << CONST_SHIFT
-PW_MF1613       times 4 dw -F_1_613 << CONST_SHIFT
-PW_F1082        times 4 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
-                                        ; JCOEF workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_ifast_mmx)
-
-EXTN(jsimd_idct_ifast_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm1,mm0
-        packsswb mm1,mm1
-        movd    eax,mm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
-        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
-        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
-        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
-        movq      mm3,mm2
-        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
-        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        psubw   mm0,mm2                 ; mm0=tmp11
-        psubw   mm1,mm3
-        paddw   mm4,mm2                 ; mm4=tmp10
-        paddw   mm5,mm3                 ; mm5=tmp13
-
-        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
-        psubw   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        psubw   mm4,mm5                 ; mm4=tmp3
-        psubw   mm0,mm1                 ; mm0=tmp2
-        paddw   mm6,mm5                 ; mm6=tmp0
-        paddw   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
-        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        psubw   mm2,mm1                 ; mm2=z12
-        psubw   mm5,mm3                 ; mm5=z10
-        paddw   mm4,mm1                 ; mm4=z11
-        paddw   mm0,mm3                 ; mm0=z13
-
-        movq    mm1,mm5                 ; mm1=z10(unscaled)
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
-
-        movq    mm3,mm4
-        psubw   mm4,mm0
-        paddw   mm3,mm0                 ; mm3=tmp7
-
-        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movq    mm0,mm5
-        paddw   mm5,mm2
-        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
-        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
-        psubw   mm0,mm1
-        psubw   mm2,mm5                 ; mm2=tmp10
-        paddw   mm0,mm5                 ; mm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   mm0,mm3                 ; mm0=tmp6
-        movq    mm1,mm6
-        movq    mm5,mm7
-        paddw   mm6,mm3                 ; mm6=data0=(00 01 02 03)
-        paddw   mm7,mm0                 ; mm7=data1=(10 11 12 13)
-        psubw   mm1,mm3                 ; mm1=data7=(70 71 72 73)
-        psubw   mm5,mm0                 ; mm5=data6=(60 61 62 63)
-        psubw   mm4,mm0                 ; mm4=tmp5
-
-        movq      mm3,mm6               ; transpose coefficients(phase 1)
-        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
-        punpckhwd mm3,mm7               ; mm3=(02 12 03 13)
-        movq      mm0,mm5               ; transpose coefficients(phase 1)
-        punpcklwd mm5,mm1               ; mm5=(60 70 61 71)
-        punpckhwd mm0,mm1               ; mm0=(62 72 63 73)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
-        movq    mm1, MMWORD [wk(1)]     ; mm1=tmp3
-
-        movq    MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
-        movq    MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)
-
-        paddw   mm2,mm4                 ; mm2=tmp4
-        movq    mm5,mm7
-        movq    mm0,mm1
-        paddw   mm7,mm4                 ; mm7=data2=(20 21 22 23)
-        paddw   mm1,mm2                 ; mm1=data4=(40 41 42 43)
-        psubw   mm5,mm4                 ; mm5=data5=(50 51 52 53)
-        psubw   mm0,mm2                 ; mm0=data3=(30 31 32 33)
-
-        movq      mm4,mm7               ; transpose coefficients(phase 1)
-        punpcklwd mm7,mm0               ; mm7=(20 30 21 31)
-        punpckhwd mm4,mm0               ; mm4=(22 32 23 33)
-        movq      mm2,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm5               ; mm1=(40 50 41 51)
-        punpckhwd mm2,mm5               ; mm2=(42 52 43 53)
-
-        movq      mm0,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm7               ; mm6=(00 10 20 30)
-        punpckhdq mm0,mm7               ; mm0=(01 11 21 31)
-        movq      mm5,mm3               ; transpose coefficients(phase 2)
-        punpckldq mm3,mm4               ; mm3=(02 12 22 32)
-        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
-        movq    mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-        movq      mm6,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm7               ; mm1=(40 50 60 70)
-        punpckhdq mm6,mm7               ; mm6=(41 51 61 71)
-        movq      mm0,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm4               ; mm2=(42 52 62 72)
-        punpckhdq mm0,mm4               ; mm0=(43 53 63 73)
-
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_IFAST_MULT_TYPE      ; quantptr
-        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        movq    mm4,mm0
-        movq    mm5,mm1
-        psubw   mm0,mm2                 ; mm0=tmp11
-        psubw   mm1,mm3
-        paddw   mm4,mm2                 ; mm4=tmp10
-        paddw   mm5,mm3                 ; mm5=tmp13
-
-        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
-        psubw   mm1,mm5                 ; mm1=tmp12
-
-        movq    mm6,mm4
-        movq    mm7,mm0
-        psubw   mm4,mm5                 ; mm4=tmp3
-        psubw   mm0,mm1                 ; mm0=tmp2
-        paddw   mm6,mm5                 ; mm6=tmp0
-        paddw   mm7,mm1                 ; mm7=tmp1
-
-        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
-        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        movq    mm4,mm2
-        movq    mm0,mm5
-        psubw   mm2,mm1                 ; mm2=z12
-        psubw   mm5,mm3                 ; mm5=z10
-        paddw   mm4,mm1                 ; mm4=z11
-        paddw   mm0,mm3                 ; mm0=z13
-
-        movq    mm1,mm5                 ; mm1=z10(unscaled)
-        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   mm5,PRE_MULTIPLY_SCALE_BITS
-
-        movq    mm3,mm4
-        psubw   mm4,mm0
-        paddw   mm3,mm0                 ; mm3=tmp7
-
-        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movq    mm0,mm5
-        paddw   mm5,mm2
-        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
-        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
-        psubw   mm0,mm1
-        psubw   mm2,mm5                 ; mm2=tmp10
-        paddw   mm0,mm5                 ; mm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   mm0,mm3                 ; mm0=tmp6
-        movq    mm1,mm6
-        movq    mm5,mm7
-        paddw   mm6,mm3                 ; mm6=data0=(00 10 20 30)
-        paddw   mm7,mm0                 ; mm7=data1=(01 11 21 31)
-        psraw   mm6,(PASS1_BITS+3)      ; descale
-        psraw   mm7,(PASS1_BITS+3)      ; descale
-        psubw   mm1,mm3                 ; mm1=data7=(07 17 27 37)
-        psubw   mm5,mm0                 ; mm5=data6=(06 16 26 36)
-        psraw   mm1,(PASS1_BITS+3)      ; descale
-        psraw   mm5,(PASS1_BITS+3)      ; descale
-        psubw   mm4,mm0                 ; mm4=tmp5
-
-        packsswb  mm6,mm5               ; mm6=(00 10 20 30 06 16 26 36)
-        packsswb  mm7,mm1               ; mm7=(01 11 21 31 07 17 27 37)
-
-        movq    mm3, MMWORD [wk(0)]     ; mm3=tmp2
-        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp3
-
-        paddw   mm2,mm4                 ; mm2=tmp4
-        movq    mm5,mm3
-        movq    mm1,mm0
-        paddw   mm3,mm4                 ; mm3=data2=(02 12 22 32)
-        paddw   mm0,mm2                 ; mm0=data4=(04 14 24 34)
-        psraw   mm3,(PASS1_BITS+3)      ; descale
-        psraw   mm0,(PASS1_BITS+3)      ; descale
-        psubw   mm5,mm4                 ; mm5=data5=(05 15 25 35)
-        psubw   mm1,mm2                 ; mm1=data3=(03 13 23 33)
-        psraw   mm5,(PASS1_BITS+3)      ; descale
-        psraw   mm1,(PASS1_BITS+3)      ; descale
-
-        movq      mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm4=[PB_CENTERJSAMP]
-
-        packsswb  mm3,mm0               ; mm3=(02 12 22 32 04 14 24 34)
-        packsswb  mm1,mm5               ; mm1=(03 13 23 33 05 15 25 35)
-
-        paddb     mm6,mm4
-        paddb     mm7,mm4
-        paddb     mm3,mm4
-        paddb     mm1,mm4
-
-        movq      mm2,mm6               ; transpose coefficients(phase 1)
-        punpcklbw mm6,mm7               ; mm6=(00 01 10 11 20 21 30 31)
-        punpckhbw mm2,mm7               ; mm2=(06 07 16 17 26 27 36 37)
-        movq      mm0,mm3               ; transpose coefficients(phase 1)
-        punpcklbw mm3,mm1               ; mm3=(02 03 12 13 22 23 32 33)
-        punpckhbw mm0,mm1               ; mm0=(04 05 14 15 24 25 34 35)
-
-        movq      mm5,mm6               ; transpose coefficients(phase 2)
-        punpcklwd mm6,mm3               ; mm6=(00 01 02 03 10 11 12 13)
-        punpckhwd mm5,mm3               ; mm5=(20 21 22 23 30 31 32 33)
-        movq      mm4,mm0               ; transpose coefficients(phase 2)
-        punpcklwd mm0,mm2               ; mm0=(04 05 06 07 14 15 16 17)
-        punpckhwd mm4,mm2               ; mm4=(24 25 26 27 34 35 36 37)
-
-        movq      mm7,mm6               ; transpose coefficients(phase 3)
-        punpckldq mm6,mm0               ; mm6=(00 01 02 03 04 05 06 07)
-        punpckhdq mm7,mm0               ; mm7=(10 11 12 13 14 15 16 17)
-        movq      mm1,mm5               ; transpose coefficients(phase 3)
-        punpckldq mm5,mm4               ; mm5=(20 21 22 23 24 25 26 27)
-        punpckhdq mm1,mm4               ; mm1=(30 31 32 33 34 35 36 37)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctfst-sse2-64.asm b/media/libjpeg/simd/jidctfst-sse2-64.asm
deleted file mode 100644
index 48846426d2..0000000000
--- a/media/libjpeg/simd/jidctfst-sse2-64.asm
+++ /dev/null
@@ -1,491 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-%define PASS1_BITS      2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ     277             ; FIX(1.082392200)
-F_1_414 equ     362             ; FIX(1.414213562)
-F_1_847 equ     473             ; FIX(1.847759065)
-F_2_613 equ     669             ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
-F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT
-PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT
-PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info *compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0
-        packsswb xmm1,xmm1
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    rax,rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
-        jmp     near .column_end
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        psubw   xmm0,xmm2               ; xmm0=tmp11
-        psubw   xmm1,xmm3
-        paddw   xmm4,xmm2               ; xmm4=tmp10
-        paddw   xmm5,xmm3               ; xmm5=tmp13
-
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm1,[rel PW_F1414]
-        psubw   xmm1,xmm5               ; xmm1=tmp12
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm7,xmm0
-        psubw   xmm4,xmm5               ; xmm4=tmp3
-        psubw   xmm0,xmm1               ; xmm0=tmp2
-        paddw   xmm6,xmm5               ; xmm6=tmp0
-        paddw   xmm7,xmm1               ; xmm7=tmp1
-
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm0,xmm5
-        psubw   xmm2,xmm1               ; xmm2=z12
-        psubw   xmm5,xmm3               ; xmm5=z10
-        paddw   xmm4,xmm1               ; xmm4=z11
-        paddw   xmm0,xmm3               ; xmm0=z13
-
-        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm3,xmm4
-        psubw   xmm4,xmm0
-        paddw   xmm3,xmm0               ; xmm3=tmp7
-
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm4,[rel PW_F1414]     ; xmm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm0,xmm5
-        paddw   xmm5,xmm2
-        pmulhw  xmm5,[rel PW_F1847]     ; xmm5=z5
-        pmulhw  xmm0,[rel PW_MF1613]
-        pmulhw  xmm2,[rel PW_F1082]
-        psubw   xmm0,xmm1
-        psubw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm0,xmm5               ; xmm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm0,xmm3               ; xmm0=tmp6
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm7
-        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
-        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
-        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
-        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
-        psubw   xmm4,xmm0               ; xmm4=tmp5
-
-        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
-        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
-
-        paddw   xmm2,xmm4               ; xmm2=tmp4
-        movdqa  xmm5,xmm7
-        movdqa  xmm0,xmm1
-        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
-        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
-        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
-        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
-        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)
-
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
-        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3
-
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
-        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     rax, [original_rbp]
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-
-        ; -- Even part
-
-        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm0,xmm5
-        psubw   xmm6,xmm1               ; xmm6=tmp11
-        psubw   xmm5,xmm3
-        paddw   xmm2,xmm1               ; xmm2=tmp10
-        paddw   xmm0,xmm3               ; xmm0=tmp13
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[rel PW_F1414]
-        psubw   xmm5,xmm0               ; xmm5=tmp12
-
-        movdqa  xmm1,xmm2
-        movdqa  xmm3,xmm6
-        psubw   xmm2,xmm0               ; xmm2=tmp3
-        psubw   xmm6,xmm5               ; xmm6=tmp2
-        paddw   xmm1,xmm0               ; xmm1=tmp0
-        paddw   xmm3,xmm5               ; xmm3=tmp1
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
-
-        ; -- Odd part
-
-        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-        movdqa  xmm2,xmm0
-        movdqa  xmm6,xmm4
-        psubw   xmm0,xmm7               ; xmm0=z12
-        psubw   xmm4,xmm5               ; xmm4=z10
-        paddw   xmm2,xmm7               ; xmm2=z11
-        paddw   xmm6,xmm5               ; xmm6=z13
-
-        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm5,xmm2
-        psubw   xmm2,xmm6
-        paddw   xmm5,xmm6               ; xmm5=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm2,[rel PW_F1414]     ; xmm2=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm6,xmm4
-        paddw   xmm4,xmm0
-        pmulhw  xmm4,[rel PW_F1847]     ; xmm4=z5
-        pmulhw  xmm6,[rel PW_MF1613]
-        pmulhw  xmm0,[rel PW_F1082]
-        psubw   xmm6,xmm7
-        psubw   xmm0,xmm4               ; xmm0=tmp10
-        paddw   xmm6,xmm4               ; xmm6=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm6,xmm5               ; xmm6=tmp6
-        movdqa  xmm7,xmm1
-        movdqa  xmm4,xmm3
-        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
-        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        psraw   xmm1,(PASS1_BITS+3)     ; descale
-        psraw   xmm3,(PASS1_BITS+3)     ; descale
-        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
-        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psubw   xmm2,xmm6               ; xmm2=tmp5
-
-        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
-
-        paddw   xmm0,xmm2               ; xmm0=tmp4
-        movdqa  xmm4,xmm5
-        movdqa  xmm7,xmm6
-        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
-        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
-        psraw   xmm5,(PASS1_BITS+3)     ; descale
-        psraw   xmm6,(PASS1_BITS+3)     ; descale
-        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
-        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-
-        movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
-
-        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm1,xmm2
-        paddb     xmm3,xmm2
-        paddb     xmm5,xmm2
-        paddb     xmm7,xmm2
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
-        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
-        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
-        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
-        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-        mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
-
-        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-        mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctfst-sse2.asm b/media/libjpeg/simd/jidctfst-sse2.asm
deleted file mode 100644
index f591e55f0f..0000000000
--- a/media/libjpeg/simd/jidctfst-sse2.asm
+++ /dev/null
@@ -1,501 +0,0 @@
-;
-; jidctfst.asm - fast integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a fast, not so accurate integer implementation of
-; the inverse DCT (Discrete Cosine Transform). The following code is
-; based directly on the IJG's original jidctfst.c; see the jidctfst.c
-; for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      8       ; 14 is also OK.
-%define PASS1_BITS      2
-
-%if IFAST_SCALE_BITS != PASS1_BITS
-%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
-%endif
-
-%if CONST_BITS == 8
-F_1_082 equ     277             ; FIX(1.082392200)
-F_1_414 equ     362             ; FIX(1.414213562)
-F_1_847 equ     473             ; FIX(1.847759065)
-F_2_613 equ     669             ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
-F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
-F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
-; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
-
-%define PRE_MULTIPLY_SCALE_BITS   2
-%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
-
-        alignz  16
-        global  EXTN(jconst_idct_ifast_sse2)
-
-EXTN(jconst_idct_ifast_sse2):
-
-PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT
-PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT
-PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT
-PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_ifast_sse2)
-
-EXTN(jsimd_idct_ifast_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0
-        packsswb xmm1,xmm1
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
-        jmp     near .column_end
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm0
-        movdqa  xmm5,xmm1
-        psubw   xmm0,xmm2               ; xmm0=tmp11
-        psubw   xmm1,xmm3
-        paddw   xmm4,xmm2               ; xmm4=tmp10
-        paddw   xmm5,xmm3               ; xmm5=tmp13
-
-        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm1,[GOTOFF(ebx,PW_F1414)]
-        psubw   xmm1,xmm5               ; xmm1=tmp12
-
-        movdqa  xmm6,xmm4
-        movdqa  xmm7,xmm0
-        psubw   xmm4,xmm5               ; xmm4=tmp3
-        psubw   xmm0,xmm1               ; xmm0=tmp2
-        paddw   xmm6,xmm5               ; xmm6=tmp0
-        paddw   xmm7,xmm1               ; xmm7=tmp1
-
-        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
-        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
-
-        ; -- Odd part
-
-        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
-
-        movdqa  xmm4,xmm2
-        movdqa  xmm0,xmm5
-        psubw   xmm2,xmm1               ; xmm2=z12
-        psubw   xmm5,xmm3               ; xmm5=z10
-        paddw   xmm4,xmm1               ; xmm4=z11
-        paddw   xmm0,xmm3               ; xmm0=z13
-
-        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm3,xmm4
-        psubw   xmm4,xmm0
-        paddw   xmm3,xmm0               ; xmm3=tmp7
-
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F1414)]     ; xmm4=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm0,xmm5
-        paddw   xmm5,xmm2
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F1847)]     ; xmm5=z5
-        pmulhw  xmm0,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  xmm2,[GOTOFF(ebx,PW_F1082)]
-        psubw   xmm0,xmm1
-        psubw   xmm2,xmm5               ; xmm2=tmp10
-        paddw   xmm0,xmm5               ; xmm0=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm0,xmm3               ; xmm0=tmp6
-        movdqa  xmm1,xmm6
-        movdqa  xmm5,xmm7
-        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
-        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
-        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
-        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
-        psubw   xmm4,xmm0               ; xmm4=tmp5
-
-        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
-        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
-        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
-        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
-
-        paddw   xmm2,xmm4               ; xmm2=tmp4
-        movdqa  xmm5,xmm7
-        movdqa  xmm0,xmm1
-        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
-        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
-        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
-        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
-        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)
-
-        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
-        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
-        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
-        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
-        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
-        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3
-
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
-        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Even part
-
-        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
-
-        movdqa  xmm2,xmm6
-        movdqa  xmm0,xmm5
-        psubw   xmm6,xmm1               ; xmm6=tmp11
-        psubw   xmm5,xmm3
-        paddw   xmm2,xmm1               ; xmm2=tmp10
-        paddw   xmm0,xmm3               ; xmm0=tmp13
-
-        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm5,[GOTOFF(ebx,PW_F1414)]
-        psubw   xmm5,xmm0               ; xmm5=tmp12
-
-        movdqa  xmm1,xmm2
-        movdqa  xmm3,xmm6
-        psubw   xmm2,xmm0               ; xmm2=tmp3
-        psubw   xmm6,xmm5               ; xmm6=tmp2
-        paddw   xmm1,xmm0               ; xmm1=tmp0
-        paddw   xmm3,xmm5               ; xmm3=tmp1
-
-        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
-
-        ; -- Odd part
-
-        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
-
-        movdqa  xmm2,xmm0
-        movdqa  xmm6,xmm4
-        psubw   xmm0,xmm7               ; xmm0=z12
-        psubw   xmm4,xmm5               ; xmm4=z10
-        paddw   xmm2,xmm7               ; xmm2=z11
-        paddw   xmm6,xmm5               ; xmm6=z13
-
-        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
-        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
-        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
-
-        movdqa  xmm5,xmm2
-        psubw   xmm2,xmm6
-        paddw   xmm5,xmm6               ; xmm5=tmp7
-
-        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
-        pmulhw  xmm2,[GOTOFF(ebx,PW_F1414)]     ; xmm2=tmp11
-
-        ; To avoid overflow...
-        ;
-        ; (Original)
-        ; tmp12 = -2.613125930 * z10 + z5;
-        ;
-        ; (This implementation)
-        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
-        ;       = -1.613125930 * z10 - z10 + z5;
-
-        movdqa  xmm6,xmm4
-        paddw   xmm4,xmm0
-        pmulhw  xmm4,[GOTOFF(ebx,PW_F1847)]     ; xmm4=z5
-        pmulhw  xmm6,[GOTOFF(ebx,PW_MF1613)]
-        pmulhw  xmm0,[GOTOFF(ebx,PW_F1082)]
-        psubw   xmm6,xmm7
-        psubw   xmm0,xmm4               ; xmm0=tmp10
-        paddw   xmm6,xmm4               ; xmm6=tmp12
-
-        ; -- Final output stage
-
-        psubw   xmm6,xmm5               ; xmm6=tmp6
-        movdqa  xmm7,xmm1
-        movdqa  xmm4,xmm3
-        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
-        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        psraw   xmm1,(PASS1_BITS+3)     ; descale
-        psraw   xmm3,(PASS1_BITS+3)     ; descale
-        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
-        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psubw   xmm2,xmm6               ; xmm2=tmp5
-
-        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
-        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
-
-        paddw   xmm0,xmm2               ; xmm0=tmp4
-        movdqa  xmm4,xmm5
-        movdqa  xmm7,xmm6
-        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
-        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
-        psraw   xmm5,(PASS1_BITS+3)     ; descale
-        psraw   xmm6,(PASS1_BITS+3)     ; descale
-        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
-        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
-        psraw   xmm4,(PASS1_BITS+3)     ; descale
-        psraw   xmm7,(PASS1_BITS+3)     ; descale
-
-        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
-
-        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm1,xmm2
-        paddb     xmm3,xmm2
-        paddb     xmm5,xmm2
-        paddb     xmm7,xmm2
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
-        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
-        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
-        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
-        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
-        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7
-
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctint-mmx.asm b/media/libjpeg/simd/jidctint-mmx.asm
deleted file mode 100644
index 5bd198120b..0000000000
--- a/media/libjpeg/simd/jidctint-mmx.asm
+++ /dev/null
@@ -1,851 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_islow_mmx)
-
-EXTN(jconst_idct_islow_mmx):
-
-PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_mmx (void *dct_table, JCOEFPTR coef_block,
-;                       JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          12
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
-                                        ; JCOEF workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_islow_mmx)
-
-EXTN(jsimd_idct_islow_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_MMX
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm1,mm0
-        packsswb mm1,mm1
-        movd    eax,mm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   mm0,PASS1_BITS
-
-        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
-        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
-        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
-        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
-        movq      mm3,mm2
-        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
-        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movq      mm4,mm1               ; mm1=in2=z2
-        movq      mm5,mm1
-        punpcklwd mm4,mm3               ; mm3=in6=z3
-        punpckhwd mm5,mm3
-        movq      mm1,mm4
-        movq      mm3,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=tmp3L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]        ; mm5=tmp3H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]       ; mm3=tmp2H
-
-        movq      mm6,mm0
-        paddw     mm0,mm2               ; mm0=in0+in4
-        psubw     mm6,mm2               ; mm6=in0-in4
-
-        pxor      mm7,mm7
-        pxor      mm2,mm2
-        punpcklwd mm7,mm0               ; mm7=tmp0L
-        punpckhwd mm2,mm0               ; mm2=tmp0H
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-        psrad     mm2,(16-CONST_BITS)   ; psrad mm2,16 & pslld mm2,CONST_BITS
-
-        movq    mm0,mm7
-        paddd   mm7,mm4                 ; mm7=tmp10L
-        psubd   mm0,mm4                 ; mm0=tmp13L
-        movq    mm4,mm2
-        paddd   mm2,mm5                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp13H
-
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
-        movq    MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
-        movq    MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
-
-        pxor      mm5,mm5
-        pxor      mm7,mm7
-        punpcklwd mm5,mm6               ; mm5=tmp1L
-        punpckhwd mm7,mm6               ; mm7=tmp1H
-        psrad     mm5,(16-CONST_BITS)   ; psrad mm5,16 & pslld mm5,CONST_BITS
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-
-        movq    mm2,mm5
-        paddd   mm5,mm1                 ; mm5=tmp11L
-        psubd   mm2,mm1                 ; mm2=tmp12L
-        movq    mm0,mm7
-        paddd   mm7,mm3                 ; mm7=tmp11H
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        movq    MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
-        movq    MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
-        movq    MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
-        movq    MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movq    mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm4, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm6, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm1, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movq    mm5,mm6
-        movq    mm7,mm4
-        paddw   mm5,mm3                 ; mm5=z3
-        paddw   mm7,mm1                 ; mm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm2,mm5
-        movq      mm0,mm5
-        punpcklwd mm2,mm7
-        punpckhwd mm0,mm7
-        movq      mm5,mm2
-        movq      mm7,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]       ; mm2=z3L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]       ; mm0=z3H
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]        ; mm5=z4L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]        ; mm7=z4H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=z3L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movq      mm2,mm3
-        movq      mm0,mm3
-        punpcklwd mm2,mm4
-        punpckhwd mm0,mm4
-        movq      mm3,mm2
-        movq      mm4,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm2=tmp0L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm0=tmp0H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]       ; mm3=tmp3L
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]       ; mm4=tmp3H
-
-        paddd   mm2, MMWORD [wk(10)]    ; mm2=tmp0L
-        paddd   mm0, MMWORD [wk(11)]    ; mm0=tmp0H
-        paddd   mm3,mm5                 ; mm3=tmp3L
-        paddd   mm4,mm7                 ; mm4=tmp3H
-
-        movq    MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
-        movq    MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
-
-        movq      mm2,mm1
-        movq      mm0,mm1
-        punpcklwd mm2,mm6
-        punpckhwd mm0,mm6
-        movq      mm1,mm2
-        movq      mm6,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm2=tmp1L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm0=tmp1H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]       ; mm1=tmp2L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]       ; mm6=tmp2H
-
-        paddd   mm2,mm5                 ; mm2=tmp1L
-        paddd   mm0,mm7                 ; mm0=tmp1H
-        paddd   mm1, MMWORD [wk(10)]    ; mm1=tmp2L
-        paddd   mm6, MMWORD [wk(11)]    ; mm6=tmp2H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp10L
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp10H
-
-        movq    mm2,mm5
-        movq    mm0,mm7
-        paddd   mm5,mm3                 ; mm5=data0L
-        paddd   mm7,mm4                 ; mm7=data0H
-        psubd   mm2,mm3                 ; mm2=data7L
-        psubd   mm0,mm4                 ; mm0=data7H
-
-        movq    mm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm3=[PD_DESCALE_P1]
-
-        paddd   mm5,mm3
-        paddd   mm7,mm3
-        psrad   mm5,DESCALE_P1
-        psrad   mm7,DESCALE_P1
-        paddd   mm2,mm3
-        paddd   mm0,mm3
-        psrad   mm2,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-
-        packssdw  mm5,mm7               ; mm5=data0=(00 01 02 03)
-        packssdw  mm2,mm0               ; mm2=data7=(70 71 72 73)
-
-        movq    mm4, MMWORD [wk(4)]     ; mm4=tmp11L
-        movq    mm3, MMWORD [wk(5)]     ; mm3=tmp11H
-
-        movq    mm7,mm4
-        movq    mm0,mm3
-        paddd   mm4,mm1                 ; mm4=data1L
-        paddd   mm3,mm6                 ; mm3=data1H
-        psubd   mm7,mm1                 ; mm7=data6L
-        psubd   mm0,mm6                 ; mm0=data6H
-
-        movq    mm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm1=[PD_DESCALE_P1]
-
-        paddd   mm4,mm1
-        paddd   mm3,mm1
-        psrad   mm4,DESCALE_P1
-        psrad   mm3,DESCALE_P1
-        paddd   mm7,mm1
-        paddd   mm0,mm1
-        psrad   mm7,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-
-        packssdw  mm4,mm3               ; mm4=data1=(10 11 12 13)
-        packssdw  mm7,mm0               ; mm7=data6=(60 61 62 63)
-
-        movq      mm6,mm5               ; transpose coefficients(phase 1)
-        punpcklwd mm5,mm4               ; mm5=(00 10 01 11)
-        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
-        movq      mm1,mm7               ; transpose coefficients(phase 1)
-        punpcklwd mm7,mm2               ; mm7=(60 70 61 71)
-        punpckhwd mm1,mm2               ; mm1=(62 72 63 73)
-
-        movq    mm3, MMWORD [wk(6)]     ; mm3=tmp12L
-        movq    mm0, MMWORD [wk(7)]     ; mm0=tmp12H
-        movq    mm4, MMWORD [wk(10)]    ; mm4=tmp1L
-        movq    mm2, MMWORD [wk(11)]    ; mm2=tmp1H
-
-        movq    MMWORD [wk(0)], mm5     ; wk(0)=(00 10 01 11)
-        movq    MMWORD [wk(1)], mm6     ; wk(1)=(02 12 03 13)
-        movq    MMWORD [wk(4)], mm7     ; wk(4)=(60 70 61 71)
-        movq    MMWORD [wk(5)], mm1     ; wk(5)=(62 72 63 73)
-
-        movq    mm5,mm3
-        movq    mm6,mm0
-        paddd   mm3,mm4                 ; mm3=data2L
-        paddd   mm0,mm2                 ; mm0=data2H
-        psubd   mm5,mm4                 ; mm5=data5L
-        psubd   mm6,mm2                 ; mm6=data5H
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm7=[PD_DESCALE_P1]
-
-        paddd   mm3,mm7
-        paddd   mm0,mm7
-        psrad   mm3,DESCALE_P1
-        psrad   mm0,DESCALE_P1
-        paddd   mm5,mm7
-        paddd   mm6,mm7
-        psrad   mm5,DESCALE_P1
-        psrad   mm6,DESCALE_P1
-
-        packssdw  mm3,mm0               ; mm3=data2=(20 21 22 23)
-        packssdw  mm5,mm6               ; mm5=data5=(50 51 52 53)
-
-        movq    mm1, MMWORD [wk(2)]     ; mm1=tmp13L
-        movq    mm4, MMWORD [wk(3)]     ; mm4=tmp13H
-        movq    mm2, MMWORD [wk(8)]     ; mm2=tmp0L
-        movq    mm7, MMWORD [wk(9)]     ; mm7=tmp0H
-
-        movq    mm0,mm1
-        movq    mm6,mm4
-        paddd   mm1,mm2                 ; mm1=data3L
-        paddd   mm4,mm7                 ; mm4=data3H
-        psubd   mm0,mm2                 ; mm0=data4L
-        psubd   mm6,mm7                 ; mm6=data4H
-
-        movq    mm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; mm2=[PD_DESCALE_P1]
-
-        paddd   mm1,mm2
-        paddd   mm4,mm2
-        psrad   mm1,DESCALE_P1
-        psrad   mm4,DESCALE_P1
-        paddd   mm0,mm2
-        paddd   mm6,mm2
-        psrad   mm0,DESCALE_P1
-        psrad   mm6,DESCALE_P1
-
-        packssdw  mm1,mm4               ; mm1=data3=(30 31 32 33)
-        packssdw  mm0,mm6               ; mm0=data4=(40 41 42 43)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=(00 10 01 11)
-        movq    mm2, MMWORD [wk(1)]     ; mm2=(02 12 03 13)
-
-        movq      mm4,mm3               ; transpose coefficients(phase 1)
-        punpcklwd mm3,mm1               ; mm3=(20 30 21 31)
-        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
-        movq      mm6,mm0               ; transpose coefficients(phase 1)
-        punpcklwd mm0,mm5               ; mm0=(40 50 41 51)
-        punpckhwd mm6,mm5               ; mm6=(42 52 43 53)
-
-        movq      mm1,mm7               ; transpose coefficients(phase 2)
-        punpckldq mm7,mm3               ; mm7=(00 10 20 30)
-        punpckhdq mm1,mm3               ; mm1=(01 11 21 31)
-        movq      mm5,mm2               ; transpose coefficients(phase 2)
-        punpckldq mm2,mm4               ; mm2=(02 12 22 32)
-        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)
-
-        movq    mm3, MMWORD [wk(4)]     ; mm3=(60 70 61 71)
-        movq    mm4, MMWORD [wk(5)]     ; mm4=(62 72 63 73)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm7
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
-
-        movq      mm7,mm0               ; transpose coefficients(phase 2)
-        punpckldq mm0,mm3               ; mm0=(40 50 60 70)
-        punpckhdq mm7,mm3               ; mm7=(41 51 61 71)
-        movq      mm1,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm4               ; mm6=(42 52 62 72)
-        punpckhdq mm1,mm4               ; mm1=(43 53 63 73)
-
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm7
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm1
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
-        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.rowloop:
-
-        ; -- Even part
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movq      mm4,mm1               ; mm1=in2=z2
-        movq      mm5,mm1
-        punpcklwd mm4,mm3               ; mm3=in6=z3
-        punpckhwd mm5,mm3
-        movq      mm1,mm4
-        movq      mm3,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=tmp3L
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F130_F054)]        ; mm5=tmp3H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F054_MF130)]       ; mm3=tmp2H
-
-        movq      mm6,mm0
-        paddw     mm0,mm2               ; mm0=in0+in4
-        psubw     mm6,mm2               ; mm6=in0-in4
-
-        pxor      mm7,mm7
-        pxor      mm2,mm2
-        punpcklwd mm7,mm0               ; mm7=tmp0L
-        punpckhwd mm2,mm0               ; mm2=tmp0H
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-        psrad     mm2,(16-CONST_BITS)   ; psrad mm2,16 & pslld mm2,CONST_BITS
-
-        movq    mm0,mm7
-        paddd   mm7,mm4                 ; mm7=tmp10L
-        psubd   mm0,mm4                 ; mm0=tmp13L
-        movq    mm4,mm2
-        paddd   mm2,mm5                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp13H
-
-        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp10L
-        movq    MMWORD [wk(1)], mm2     ; wk(1)=tmp10H
-        movq    MMWORD [wk(2)], mm0     ; wk(2)=tmp13L
-        movq    MMWORD [wk(3)], mm4     ; wk(3)=tmp13H
-
-        pxor      mm5,mm5
-        pxor      mm7,mm7
-        punpcklwd mm5,mm6               ; mm5=tmp1L
-        punpckhwd mm7,mm6               ; mm7=tmp1H
-        psrad     mm5,(16-CONST_BITS)   ; psrad mm5,16 & pslld mm5,CONST_BITS
-        psrad     mm7,(16-CONST_BITS)   ; psrad mm7,16 & pslld mm7,CONST_BITS
-
-        movq    mm2,mm5
-        paddd   mm5,mm1                 ; mm5=tmp11L
-        psubd   mm2,mm1                 ; mm2=tmp12L
-        movq    mm0,mm7
-        paddd   mm7,mm3                 ; mm7=tmp11H
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        movq    MMWORD [wk(4)], mm5     ; wk(4)=tmp11L
-        movq    MMWORD [wk(5)], mm7     ; wk(5)=tmp11H
-        movq    MMWORD [wk(6)], mm2     ; wk(6)=tmp12L
-        movq    MMWORD [wk(7)], mm0     ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movq    mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm6, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        movq    mm5,mm6
-        movq    mm7,mm4
-        paddw   mm5,mm3                 ; mm5=z3
-        paddw   mm7,mm1                 ; mm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movq      mm2,mm5
-        movq      mm0,mm5
-        punpcklwd mm2,mm7
-        punpckhwd mm0,mm7
-        movq      mm5,mm2
-        movq      mm7,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF078_F117)]       ; mm2=z3L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF078_F117)]       ; mm0=z3H
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F117_F078)]        ; mm5=z4L
-        pmaddwd   mm7,[GOTOFF(ebx,PW_F117_F078)]        ; mm7=z4H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=z3L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movq      mm2,mm3
-        movq      mm0,mm3
-        punpcklwd mm2,mm4
-        punpckhwd mm0,mm4
-        movq      mm3,mm2
-        movq      mm4,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm2=tmp0L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm0=tmp0H
-        pmaddwd   mm3,[GOTOFF(ebx,PW_MF089_F060)]       ; mm3=tmp3L
-        pmaddwd   mm4,[GOTOFF(ebx,PW_MF089_F060)]       ; mm4=tmp3H
-
-        paddd   mm2, MMWORD [wk(10)]    ; mm2=tmp0L
-        paddd   mm0, MMWORD [wk(11)]    ; mm0=tmp0H
-        paddd   mm3,mm5                 ; mm3=tmp3L
-        paddd   mm4,mm7                 ; mm4=tmp3H
-
-        movq    MMWORD [wk(8)], mm2     ; wk(8)=tmp0L
-        movq    MMWORD [wk(9)], mm0     ; wk(9)=tmp0H
-
-        movq      mm2,mm1
-        movq      mm0,mm1
-        punpcklwd mm2,mm6
-        punpckhwd mm0,mm6
-        movq      mm1,mm2
-        movq      mm6,mm0
-        pmaddwd   mm2,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm2=tmp1L
-        pmaddwd   mm0,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm0=tmp1H
-        pmaddwd   mm1,[GOTOFF(ebx,PW_MF256_F050)]       ; mm1=tmp2L
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF256_F050)]       ; mm6=tmp2H
-
-        paddd   mm2,mm5                 ; mm2=tmp1L
-        paddd   mm0,mm7                 ; mm0=tmp1H
-        paddd   mm1, MMWORD [wk(10)]    ; mm1=tmp2L
-        paddd   mm6, MMWORD [wk(11)]    ; mm6=tmp2H
-
-        movq    MMWORD [wk(10)], mm2    ; wk(10)=tmp1L
-        movq    MMWORD [wk(11)], mm0    ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp10L
-        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp10H
-
-        movq    mm2,mm5
-        movq    mm0,mm7
-        paddd   mm5,mm3                 ; mm5=data0L
-        paddd   mm7,mm4                 ; mm7=data0H
-        psubd   mm2,mm3                 ; mm2=data7L
-        psubd   mm0,mm4                 ; mm0=data7H
-
-        movq    mm3,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm3=[PD_DESCALE_P2]
-
-        paddd   mm5,mm3
-        paddd   mm7,mm3
-        psrad   mm5,DESCALE_P2
-        psrad   mm7,DESCALE_P2
-        paddd   mm2,mm3
-        paddd   mm0,mm3
-        psrad   mm2,DESCALE_P2
-        psrad   mm0,DESCALE_P2
-
-        packssdw  mm5,mm7               ; mm5=data0=(00 10 20 30)
-        packssdw  mm2,mm0               ; mm2=data7=(07 17 27 37)
-
-        movq    mm4, MMWORD [wk(4)]     ; mm4=tmp11L
-        movq    mm3, MMWORD [wk(5)]     ; mm3=tmp11H
-
-        movq    mm7,mm4
-        movq    mm0,mm3
-        paddd   mm4,mm1                 ; mm4=data1L
-        paddd   mm3,mm6                 ; mm3=data1H
-        psubd   mm7,mm1                 ; mm7=data6L
-        psubd   mm0,mm6                 ; mm0=data6H
-
-        movq    mm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm1=[PD_DESCALE_P2]
-
-        paddd   mm4,mm1
-        paddd   mm3,mm1
-        psrad   mm4,DESCALE_P2
-        psrad   mm3,DESCALE_P2
-        paddd   mm7,mm1
-        paddd   mm0,mm1
-        psrad   mm7,DESCALE_P2
-        psrad   mm0,DESCALE_P2
-
-        packssdw  mm4,mm3               ; mm4=data1=(01 11 21 31)
-        packssdw  mm7,mm0               ; mm7=data6=(06 16 26 36)
-
-        packsswb  mm5,mm7               ; mm5=(00 10 20 30 06 16 26 36)
-        packsswb  mm4,mm2               ; mm4=(01 11 21 31 07 17 27 37)
-
-        movq    mm6, MMWORD [wk(6)]     ; mm6=tmp12L
-        movq    mm1, MMWORD [wk(7)]     ; mm1=tmp12H
-        movq    mm3, MMWORD [wk(10)]    ; mm3=tmp1L
-        movq    mm0, MMWORD [wk(11)]    ; mm0=tmp1H
-
-        movq    MMWORD [wk(0)], mm5     ; wk(0)=(00 10 20 30 06 16 26 36)
-        movq    MMWORD [wk(1)], mm4     ; wk(1)=(01 11 21 31 07 17 27 37)
-
-        movq    mm7,mm6
-        movq    mm2,mm1
-        paddd   mm6,mm3                 ; mm6=data2L
-        paddd   mm1,mm0                 ; mm1=data2H
-        psubd   mm7,mm3                 ; mm7=data5L
-        psubd   mm2,mm0                 ; mm2=data5H
-
-        movq    mm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm5=[PD_DESCALE_P2]
-
-        paddd   mm6,mm5
-        paddd   mm1,mm5
-        psrad   mm6,DESCALE_P2
-        psrad   mm1,DESCALE_P2
-        paddd   mm7,mm5
-        paddd   mm2,mm5
-        psrad   mm7,DESCALE_P2
-        psrad   mm2,DESCALE_P2
-
-        packssdw  mm6,mm1               ; mm6=data2=(02 12 22 32)
-        packssdw  mm7,mm2               ; mm7=data5=(05 15 25 35)
-
-        movq    mm4, MMWORD [wk(2)]     ; mm4=tmp13L
-        movq    mm3, MMWORD [wk(3)]     ; mm3=tmp13H
-        movq    mm0, MMWORD [wk(8)]     ; mm0=tmp0L
-        movq    mm5, MMWORD [wk(9)]     ; mm5=tmp0H
-
-        movq    mm1,mm4
-        movq    mm2,mm3
-        paddd   mm4,mm0                 ; mm4=data3L
-        paddd   mm3,mm5                 ; mm3=data3H
-        psubd   mm1,mm0                 ; mm1=data4L
-        psubd   mm2,mm5                 ; mm2=data4H
-
-        movq    mm0,[GOTOFF(ebx,PD_DESCALE_P2)] ; mm0=[PD_DESCALE_P2]
-
-        paddd   mm4,mm0
-        paddd   mm3,mm0
-        psrad   mm4,DESCALE_P2
-        psrad   mm3,DESCALE_P2
-        paddd   mm1,mm0
-        paddd   mm2,mm0
-        psrad   mm1,DESCALE_P2
-        psrad   mm2,DESCALE_P2
-
-        movq      mm5,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm5=[PB_CENTERJSAMP]
-
-        packssdw  mm4,mm3               ; mm4=data3=(03 13 23 33)
-        packssdw  mm1,mm2               ; mm1=data4=(04 14 24 34)
-
-        movq      mm0, MMWORD [wk(0)]   ; mm0=(00 10 20 30 06 16 26 36)
-        movq      mm3, MMWORD [wk(1)]   ; mm3=(01 11 21 31 07 17 27 37)
-
-        packsswb  mm6,mm1               ; mm6=(02 12 22 32 04 14 24 34)
-        packsswb  mm4,mm7               ; mm4=(03 13 23 33 05 15 25 35)
-
-        paddb     mm0,mm5
-        paddb     mm3,mm5
-        paddb     mm6,mm5
-        paddb     mm4,mm5
-
-        movq      mm2,mm0               ; transpose coefficients(phase 1)
-        punpcklbw mm0,mm3               ; mm0=(00 01 10 11 20 21 30 31)
-        punpckhbw mm2,mm3               ; mm2=(06 07 16 17 26 27 36 37)
-        movq      mm1,mm6               ; transpose coefficients(phase 1)
-        punpcklbw mm6,mm4               ; mm6=(02 03 12 13 22 23 32 33)
-        punpckhbw mm1,mm4               ; mm1=(04 05 14 15 24 25 34 35)
-
-        movq      mm7,mm0               ; transpose coefficients(phase 2)
-        punpcklwd mm0,mm6               ; mm0=(00 01 02 03 10 11 12 13)
-        punpckhwd mm7,mm6               ; mm7=(20 21 22 23 30 31 32 33)
-        movq      mm5,mm1               ; transpose coefficients(phase 2)
-        punpcklwd mm1,mm2               ; mm1=(04 05 06 07 14 15 16 17)
-        punpckhwd mm5,mm2               ; mm5=(24 25 26 27 34 35 36 37)
-
-        movq      mm3,mm0               ; transpose coefficients(phase 3)
-        punpckldq mm0,mm1               ; mm0=(00 01 02 03 04 05 06 07)
-        punpckhdq mm3,mm1               ; mm3=(10 11 12 13 14 15 16 17)
-        movq      mm4,mm7               ; transpose coefficients(phase 3)
-        punpckldq mm7,mm5               ; mm7=(20 21 22 23 24 25 26 27)
-        punpckhdq mm4,mm5               ; mm4=(30 31 32 33 34 35 36 37)
-
-        pushpic ebx                     ; save GOT address
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm3
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm7
-        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
-
-        poppic  ebx                     ; restore GOT address
-
-        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
-        add     edi, byte 4*SIZEOF_JSAMPROW
-        dec     ecx                             ; ctr
-        jnz     near .rowloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctint-sse2-64.asm b/media/libjpeg/simd/jidctint-sse2-64.asm
deleted file mode 100644
index afe1d6a73b..0000000000
--- a/media/libjpeg/simd/jidctint-sse2-64.asm
+++ /dev/null
@@ -1,847 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = jpeg_component_info *compptr
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          12
-
-        align   16
-        global  EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0
-        packsswb xmm1,xmm1
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    rax,rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm5,PASS1_BITS
-
-        movdqa    xmm4,xmm5             ; xmm5=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm5,xmm5             ; xmm5=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm4,xmm4             ; xmm4=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm7,xmm5,0x00          ; xmm7=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm6,xmm5,0x55          ; xmm6=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm1,xmm5,0xAA          ; xmm1=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm5,xmm5,0xFF          ; xmm5=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm0,xmm4,0x00          ; xmm0=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm3,xmm4,0x55          ; xmm3=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm2,xmm4,0xAA          ; xmm2=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm4,xmm4,0xFF          ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(8)], xmm6   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm5   ; wk(9)=col3
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-        jmp     near .column_end
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm4,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm4,xmm3             ; xmm3=in6=z3
-        punpckhwd xmm5,xmm3
-        movdqa    xmm1,xmm4
-        movdqa    xmm3,xmm5
-        pmaddwd   xmm4,[rel PW_F130_F054]       ; xmm4=tmp3L
-        pmaddwd   xmm5,[rel PW_F130_F054]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=tmp2L
-        pmaddwd   xmm3,[rel PW_F054_MF130]      ; xmm3=tmp2H
-
-        movdqa    xmm6,xmm0
-        paddw     xmm0,xmm2             ; xmm0=in0+in4
-        psubw     xmm6,xmm2             ; xmm6=in0-in4
-
-        pxor      xmm7,xmm7
-        pxor      xmm2,xmm2
-        punpcklwd xmm7,xmm0             ; xmm7=tmp0L
-        punpckhwd xmm2,xmm0             ; xmm2=tmp0H
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-        psrad     xmm2,(16-CONST_BITS)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm4               ; xmm7=tmp10L
-        psubd   xmm0,xmm4               ; xmm0=tmp13L
-        movdqa  xmm4,xmm2
-        paddd   xmm2,xmm5               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm7,xmm7
-        punpcklwd xmm5,xmm6             ; xmm5=tmp1L
-        punpckhwd xmm7,xmm6             ; xmm7=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-        movdqa  xmm2,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm2,xmm1               ; xmm2=tmp12L
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm3               ; xmm7=tmp11H
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm7,xmm4
-        paddw   xmm5,xmm3               ; xmm5=z3
-        paddw   xmm7,xmm1               ; xmm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm2,xmm5
-        movdqa    xmm0,xmm5
-        punpcklwd xmm2,xmm7
-        punpckhwd xmm0,xmm7
-        movdqa    xmm5,xmm2
-        movdqa    xmm7,xmm0
-        pmaddwd   xmm2,[rel PW_MF078_F117]      ; xmm2=z3L
-        pmaddwd   xmm0,[rel PW_MF078_F117]      ; xmm0=z3H
-        pmaddwd   xmm5,[rel PW_F117_F078]       ; xmm5=z4L
-        pmaddwd   xmm7,[rel PW_F117_F078]       ; xmm7=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm2,xmm3
-        movdqa    xmm0,xmm3
-        punpcklwd xmm2,xmm4
-        punpckhwd xmm0,xmm4
-        movdqa    xmm3,xmm2
-        movdqa    xmm4,xmm0
-        pmaddwd   xmm2,[rel PW_MF060_MF089]     ; xmm2=tmp0L
-        pmaddwd   xmm0,[rel PW_MF060_MF089]     ; xmm0=tmp0H
-        pmaddwd   xmm3,[rel PW_MF089_F060]      ; xmm3=tmp3L
-        pmaddwd   xmm4,[rel PW_MF089_F060]      ; xmm4=tmp3H
-
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
-        paddd   xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
-        paddd   xmm3,xmm5               ; xmm3=tmp3L
-        paddd   xmm4,xmm7               ; xmm4=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
-
-        movdqa    xmm2,xmm1
-        movdqa    xmm0,xmm1
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm0,xmm6
-        movdqa    xmm1,xmm2
-        movdqa    xmm6,xmm0
-        pmaddwd   xmm2,[rel PW_MF050_MF256]     ; xmm2=tmp1L
-        pmaddwd   xmm0,[rel PW_MF050_MF256]     ; xmm0=tmp1H
-        pmaddwd   xmm1,[rel PW_MF256_F050]      ; xmm1=tmp2L
-        pmaddwd   xmm6,[rel PW_MF256_F050]      ; xmm6=tmp2H
-
-        paddd   xmm2,xmm5               ; xmm2=tmp1L
-        paddd   xmm0,xmm7               ; xmm0=tmp1H
-        paddd   xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm0,xmm7
-        paddd   xmm5,xmm3               ; xmm5=data0L
-        paddd   xmm7,xmm4               ; xmm7=data0H
-        psubd   xmm2,xmm3               ; xmm2=data7L
-        psubd   xmm0,xmm4               ; xmm0=data7H
-
-        movdqa  xmm3,[rel PD_DESCALE_P1]        ; xmm3=[rel PD_DESCALE_P1]
-
-        paddd   xmm5,xmm3
-        paddd   xmm7,xmm3
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm7,DESCALE_P1
-        paddd   xmm2,xmm3
-        paddd   xmm0,xmm3
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm5,xmm7             ; xmm5=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm2,xmm0             ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-        movdqa  xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
-        movdqa  xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
-
-        movdqa  xmm7,xmm4
-        movdqa  xmm0,xmm3
-        paddd   xmm4,xmm1               ; xmm4=data1L
-        paddd   xmm3,xmm6               ; xmm3=data1H
-        psubd   xmm7,xmm1               ; xmm7=data6L
-        psubd   xmm0,xmm6               ; xmm0=data6H
-
-        movdqa  xmm1,[rel PD_DESCALE_P1]        ; xmm1=[rel PD_DESCALE_P1]
-
-        paddd   xmm4,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-        paddd   xmm7,xmm1
-        paddd   xmm0,xmm1
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm4,xmm3             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm7,xmm0             ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-        movdqa    xmm6,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm4             ; xmm5=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4             ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm2             ; xmm7=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm1,xmm2             ; xmm1=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
-        movdqa  xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
-        movdqa  xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
-        movdqa  xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm5,xmm3
-        movdqa  xmm6,xmm0
-        paddd   xmm3,xmm4               ; xmm3=data2L
-        paddd   xmm0,xmm2               ; xmm0=data2H
-        psubd   xmm5,xmm4               ; xmm5=data5L
-        psubd   xmm6,xmm2               ; xmm6=data5H
-
-        movdqa  xmm7,[rel PD_DESCALE_P1]        ; xmm7=[rel PD_DESCALE_P1]
-
-        paddd   xmm3,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm3,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-        paddd   xmm5,xmm7
-        paddd   xmm6,xmm7
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm3,xmm0             ; xmm3=data2=(20 21 22 23 24 25 26 27)
-        packssdw  xmm5,xmm6             ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-        movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
-        movdqa  xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
-        movdqa  xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
-        movdqa  xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
-
-        movdqa  xmm0,xmm1
-        movdqa  xmm6,xmm4
-        paddd   xmm1,xmm2               ; xmm1=data3L
-        paddd   xmm4,xmm7               ; xmm4=data3H
-        psubd   xmm0,xmm2               ; xmm0=data4L
-        psubd   xmm6,xmm7               ; xmm6=data4H
-
-        movdqa  xmm2,[rel PD_DESCALE_P1]        ; xmm2=[rel PD_DESCALE_P1]
-
-        paddd   xmm1,xmm2
-        paddd   xmm4,xmm2
-        psrad   xmm1,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm0,xmm2
-        paddd   xmm6,xmm2
-        psrad   xmm0,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm1,xmm4             ; xmm1=data3=(30 31 32 33 34 35 36 37)
-        packssdw  xmm0,xmm6             ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
-        movdqa  xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
-
-        movdqa    xmm4,xmm3             ; transpose coefficients(phase 1)
-        punpcklwd xmm3,xmm1             ; xmm3=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm1             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm6,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm6,xmm5             ; xmm6=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm3             ; xmm7=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm1,xmm3             ; xmm1=(02 12 22 32 03 13 23 33)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm4             ; xmm2=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm5,xmm4             ; xmm5=(06 16 26 36 07 17 27 37)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
-        movdqa  xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm2,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm2,xmm3             ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm4             ; xmm6=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm5,xmm4             ; xmm5=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm3,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm0            ; xmm7=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm3,xmm0            ; xmm3=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm4,xmm2            ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(8)], xmm3   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm4   ; wk(9)=col3
-
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm3,xmm6            ; xmm3=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm4,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm5            ; xmm2=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm4,xmm5            ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     rax, [original_rbp]
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-
-        ; -- Even part
-
-        ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm6,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm6,xmm2             ; xmm2=in6=z3
-        punpckhwd xmm5,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm2,xmm5
-        pmaddwd   xmm6,[rel PW_F130_F054]       ; xmm6=tmp3L
-        pmaddwd   xmm5,[rel PW_F130_F054]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[rel PW_F054_MF130]      ; xmm1=tmp2L
-        pmaddwd   xmm2,[rel PW_F054_MF130]      ; xmm2=tmp2H
-
-        movdqa    xmm3,xmm7
-        paddw     xmm7,xmm0             ; xmm7=in0+in4
-        psubw     xmm3,xmm0             ; xmm3=in0-in4
-
-        pxor      xmm4,xmm4
-        pxor      xmm0,xmm0
-        punpcklwd xmm4,xmm7             ; xmm4=tmp0L
-        punpckhwd xmm0,xmm7             ; xmm0=tmp0H
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-        psrad     xmm0,(16-CONST_BITS)  ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm6               ; xmm4=tmp10L
-        psubd   xmm7,xmm6               ; xmm7=tmp13L
-        movdqa  xmm6,xmm0
-        paddd   xmm0,xmm5               ; xmm0=tmp10H
-        psubd   xmm6,xmm5               ; xmm6=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm4,xmm4
-        punpcklwd xmm5,xmm3             ; xmm5=tmp1L
-        punpckhwd xmm4,xmm3             ; xmm4=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-        movdqa  xmm0,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm0,xmm1               ; xmm0=tmp12L
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm2               ; xmm4=tmp11H
-        psubd   xmm7,xmm2               ; xmm7=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm6, XMMWORD [wk(9)]   ; xmm6=col3
-        movdqa  xmm3, XMMWORD [wk(8)]   ; xmm3=col1
-        movdqa  xmm1, XMMWORD [wk(11)]  ; xmm1=col7
-        movdqa  xmm2, XMMWORD [wk(10)]  ; xmm2=col5
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm4,xmm3
-        paddw   xmm5,xmm1               ; xmm5=z3
-        paddw   xmm4,xmm2               ; xmm4=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm0,xmm5
-        movdqa    xmm7,xmm5
-        punpcklwd xmm0,xmm4
-        punpckhwd xmm7,xmm4
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm7
-        pmaddwd   xmm0,[rel PW_MF078_F117]      ; xmm0=z3L
-        pmaddwd   xmm7,[rel PW_MF078_F117]      ; xmm7=z3H
-        pmaddwd   xmm5,[rel PW_F117_F078]       ; xmm5=z4L
-        pmaddwd   xmm4,[rel PW_F117_F078]       ; xmm4=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm0,xmm1
-        movdqa    xmm7,xmm1
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm1,xmm0
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm0,[rel PW_MF060_MF089]     ; xmm0=tmp0L
-        pmaddwd   xmm7,[rel PW_MF060_MF089]     ; xmm7=tmp0H
-        pmaddwd   xmm1,[rel PW_MF089_F060]      ; xmm1=tmp3L
-        pmaddwd   xmm3,[rel PW_MF089_F060]      ; xmm3=tmp3H
-
-        paddd   xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
-        paddd   xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
-        paddd   xmm1,xmm5               ; xmm1=tmp3L
-        paddd   xmm3,xmm4               ; xmm3=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
-
-        movdqa    xmm0,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm0,xmm6
-        punpckhwd xmm7,xmm6
-        movdqa    xmm2,xmm0
-        movdqa    xmm6,xmm7
-        pmaddwd   xmm0,[rel PW_MF050_MF256]     ; xmm0=tmp1L
-        pmaddwd   xmm7,[rel PW_MF050_MF256]     ; xmm7=tmp1H
-        pmaddwd   xmm2,[rel PW_MF256_F050]      ; xmm2=tmp2L
-        pmaddwd   xmm6,[rel PW_MF256_F050]      ; xmm6=tmp2H
-
-        paddd   xmm0,xmm5               ; xmm0=tmp1L
-        paddd   xmm7,xmm4               ; xmm7=tmp1H
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm7,xmm4
-        paddd   xmm5,xmm1               ; xmm5=data0L
-        paddd   xmm4,xmm3               ; xmm4=data0H
-        psubd   xmm0,xmm1               ; xmm0=data7L
-        psubd   xmm7,xmm3               ; xmm7=data7H
-
-        movdqa  xmm1,[rel PD_DESCALE_P2]        ; xmm1=[rel PD_DESCALE_P2]
-
-        paddd   xmm5,xmm1
-        paddd   xmm4,xmm1
-        psrad   xmm5,DESCALE_P2
-        psrad   xmm4,DESCALE_P2
-        paddd   xmm0,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm5,xmm4             ; xmm5=data0=(00 10 20 30 40 50 60 70)
-        packssdw  xmm0,xmm7             ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
-        movdqa  xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm7,xmm1
-        paddd   xmm3,xmm2               ; xmm3=data1L
-        paddd   xmm1,xmm6               ; xmm1=data1H
-        psubd   xmm4,xmm2               ; xmm4=data6L
-        psubd   xmm7,xmm6               ; xmm7=data6H
-
-        movdqa  xmm2,[rel PD_DESCALE_P2]        ; xmm2=[rel PD_DESCALE_P2]
-
-        paddd   xmm3,xmm2
-        paddd   xmm1,xmm2
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm4,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm3,xmm1             ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        packssdw  xmm4,xmm7             ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-        packsswb  xmm5,xmm4             ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm0             ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
-        movdqa  xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
-        movdqa  xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm4,xmm6
-        movdqa  xmm0,xmm2
-        paddd   xmm6,xmm1               ; xmm6=data2L
-        paddd   xmm2,xmm7               ; xmm2=data2H
-        psubd   xmm4,xmm1               ; xmm4=data5L
-        psubd   xmm0,xmm7               ; xmm0=data5H
-
-        movdqa  xmm5,[rel PD_DESCALE_P2]        ; xmm5=[rel PD_DESCALE_P2]
-
-        paddd   xmm6,xmm5
-        paddd   xmm2,xmm5
-        psrad   xmm6,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm4,xmm5
-        paddd   xmm0,xmm5
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        packssdw  xmm6,xmm2             ; xmm6=data2=(02 12 22 32 42 52 62 72)
-        packssdw  xmm4,xmm0             ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-        movdqa  xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
-        movdqa  xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
-        movdqa  xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
-        movdqa  xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
-
-        movdqa  xmm2,xmm3
-        movdqa  xmm0,xmm1
-        paddd   xmm3,xmm7               ; xmm3=data3L
-        paddd   xmm1,xmm5               ; xmm1=data3H
-        psubd   xmm2,xmm7               ; xmm2=data4L
-        psubd   xmm0,xmm5               ; xmm0=data4H
-
-        movdqa  xmm7,[rel PD_DESCALE_P2]        ; xmm7=[rel PD_DESCALE_P2]
-
-        paddd   xmm3,xmm7
-        paddd   xmm1,xmm7
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm2,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm2,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        movdqa    xmm5,[rel PB_CENTERJSAMP]     ; xmm5=[rel PB_CENTERJSAMP]
-
-        packssdw  xmm3,xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
-        packssdw  xmm2,xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-        movdqa    xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        packsswb  xmm6,xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm3,xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm7,xmm5
-        paddb     xmm1,xmm5
-        paddb     xmm6,xmm5
-        paddb     xmm3,xmm5
-
-        movdqa    xmm0,xmm7     ; transpose coefficients(phase 1)
-        punpcklbw xmm7,xmm1     ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm1     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 1)
-        punpcklbw xmm6,xmm3     ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm2,xmm3     ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm7     ; transpose coefficients(phase 2)
-        punpcklwd xmm7,xmm6     ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm6     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm5,xmm2     ; transpose coefficients(phase 2)
-        punpcklwd xmm2,xmm0     ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm5,xmm0     ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm1,xmm7     ; transpose coefficients(phase 3)
-        punpckldq xmm7,xmm2     ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm1,xmm2     ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm3,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm5     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm3,xmm5     ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-        pshufd  xmm6,xmm7,0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm1,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm2,xmm4,0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm5,xmm3,0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
-        mov     rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-        mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-        mov     rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
-        movq    XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctint-sse2.asm b/media/libjpeg/simd/jidctint-sse2.asm
deleted file mode 100644
index 6c7e7d9b4f..0000000000
--- a/media/libjpeg/simd/jidctint-sse2.asm
+++ /dev/null
@@ -1,858 +0,0 @@
-;
-; jidctint.asm - accurate integer IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains a slow-but-accurate integer implementation of the
-; inverse DCT (Discrete Cosine Transform). The following code is based
-; directly on the IJG's original jidctint.c; see the jidctint.c for
-; more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
-%define DESCALE_P2      (CONST_BITS+PASS1_BITS+3)
-
-%if CONST_BITS == 13
-F_0_298 equ      2446           ; FIX(0.298631336)
-F_0_390 equ      3196           ; FIX(0.390180644)
-F_0_541 equ      4433           ; FIX(0.541196100)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_175 equ      9633           ; FIX(1.175875602)
-F_1_501 equ     12299           ; FIX(1.501321110)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_1_961 equ     16069           ; FIX(1.961570560)
-F_2_053 equ     16819           ; FIX(2.053119869)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_072 equ     25172           ; FIX(3.072711026)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
-F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
-F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
-F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
-F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_islow_sse2)
-
-EXTN(jconst_idct_islow_sse2):
-
-PW_F130_F054    times 4 dw  (F_0_541+F_0_765), F_0_541
-PW_F054_MF130   times 4 dw  F_0_541, (F_0_541-F_1_847)
-PW_MF078_F117   times 4 dw  (F_1_175-F_1_961), F_1_175
-PW_F117_F078    times 4 dw  F_1_175, (F_1_175-F_0_390)
-PW_MF060_MF089  times 4 dw  (F_0_298-F_0_899),-F_0_899
-PW_MF089_F060   times 4 dw -F_0_899, (F_1_501-F_0_899)
-PW_MF050_MF256  times 4 dw  (F_2_053-F_2_562),-F_2_562
-PW_MF256_F050   times 4 dw -F_2_562, (F_3_072-F_2_562)
-PD_DESCALE_P1   times 4 dd  1 << (DESCALE_P1-1)
-PD_DESCALE_P2   times 4 dd  1 << (DESCALE_P2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients.
-;
-; GLOBAL(void)
-; jsimd_idct_islow_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                        JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; jpeg_component_info *compptr
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          12
-
-        align   16
-        global  EXTN(jsimd_idct_islow_sse2)
-
-EXTN(jsimd_idct_islow_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     near .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm1,xmm0
-        packsswb xmm1,xmm1
-        packsswb xmm1,xmm1
-        movd    eax,xmm1
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm5,PASS1_BITS
-
-        movdqa    xmm4,xmm5             ; xmm5=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm5,xmm5             ; xmm5=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm4,xmm4             ; xmm4=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm7,xmm5,0x00          ; xmm7=col0=(00 00 00 00 00 00 00 00)
-        pshufd  xmm6,xmm5,0x55          ; xmm6=col1=(01 01 01 01 01 01 01 01)
-        pshufd  xmm1,xmm5,0xAA          ; xmm1=col2=(02 02 02 02 02 02 02 02)
-        pshufd  xmm5,xmm5,0xFF          ; xmm5=col3=(03 03 03 03 03 03 03 03)
-        pshufd  xmm0,xmm4,0x00          ; xmm0=col4=(04 04 04 04 04 04 04 04)
-        pshufd  xmm3,xmm4,0x55          ; xmm3=col5=(05 05 05 05 05 05 05 05)
-        pshufd  xmm2,xmm4,0xAA          ; xmm2=col6=(06 06 06 06 06 06 06 06)
-        pshufd  xmm4,xmm4,0xFF          ; xmm4=col7=(07 07 07 07 07 07 07 07)
-
-        movdqa  XMMWORD [wk(8)], xmm6   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm5   ; wk(9)=col3
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-        jmp     near .column_end
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Even part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm4,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm4,xmm3             ; xmm3=in6=z3
-        punpckhwd xmm5,xmm3
-        movdqa    xmm1,xmm4
-        movdqa    xmm3,xmm5
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F130_F054)]       ; xmm4=tmp3L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=tmp2L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm3=tmp2H
-
-        movdqa    xmm6,xmm0
-        paddw     xmm0,xmm2             ; xmm0=in0+in4
-        psubw     xmm6,xmm2             ; xmm6=in0-in4
-
-        pxor      xmm7,xmm7
-        pxor      xmm2,xmm2
-        punpcklwd xmm7,xmm0             ; xmm7=tmp0L
-        punpckhwd xmm2,xmm0             ; xmm2=tmp0H
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-        psrad     xmm2,(16-CONST_BITS)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS
-
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm4               ; xmm7=tmp10L
-        psubd   xmm0,xmm4               ; xmm0=tmp13L
-        movdqa  xmm4,xmm2
-        paddd   xmm2,xmm5               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm7,xmm7
-        punpcklwd xmm5,xmm6             ; xmm5=tmp1L
-        punpckhwd xmm7,xmm6             ; xmm7=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm7,(16-CONST_BITS)  ; psrad xmm7,16 & pslld xmm7,CONST_BITS
-
-        movdqa  xmm2,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm2,xmm1               ; xmm2=tmp12L
-        movdqa  xmm0,xmm7
-        paddd   xmm7,xmm3               ; xmm7=tmp11H
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm7,xmm4
-        paddw   xmm5,xmm3               ; xmm5=z3
-        paddw   xmm7,xmm1               ; xmm7=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm2,xmm5
-        movdqa    xmm0,xmm5
-        punpcklwd xmm2,xmm7
-        punpckhwd xmm0,xmm7
-        movdqa    xmm5,xmm2
-        movdqa    xmm7,xmm0
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm2=z3L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm0=z3H
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]       ; xmm5=z4L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_F117_F078)]       ; xmm7=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm2,xmm3
-        movdqa    xmm0,xmm3
-        punpcklwd xmm2,xmm4
-        punpckhwd xmm0,xmm4
-        movdqa    xmm3,xmm2
-        movdqa    xmm4,xmm0
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm2=tmp0L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm0=tmp0H
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm3=tmp3L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm4=tmp3H
-
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
-        paddd   xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
-        paddd   xmm3,xmm5               ; xmm3=tmp3L
-        paddd   xmm4,xmm7               ; xmm4=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
-
-        movdqa    xmm2,xmm1
-        movdqa    xmm0,xmm1
-        punpcklwd xmm2,xmm6
-        punpckhwd xmm0,xmm6
-        movdqa    xmm1,xmm2
-        movdqa    xmm6,xmm0
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm2=tmp1L
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm0=tmp1H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm1=tmp2L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm6=tmp2H
-
-        paddd   xmm2,xmm5               ; xmm2=tmp1L
-        paddd   xmm0,xmm7               ; xmm0=tmp1H
-        paddd   xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
-
-        movdqa  xmm2,xmm5
-        movdqa  xmm0,xmm7
-        paddd   xmm5,xmm3               ; xmm5=data0L
-        paddd   xmm7,xmm4               ; xmm7=data0H
-        psubd   xmm2,xmm3               ; xmm2=data7L
-        psubd   xmm0,xmm4               ; xmm0=data7H
-
-        movdqa  xmm3,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm3=[PD_DESCALE_P1]
-
-        paddd   xmm5,xmm3
-        paddd   xmm7,xmm3
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm7,DESCALE_P1
-        paddd   xmm2,xmm3
-        paddd   xmm0,xmm3
-        psrad   xmm2,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm5,xmm7             ; xmm5=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm2,xmm0             ; xmm2=data7=(70 71 72 73 74 75 76 77)
-
-        movdqa  xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
-        movdqa  xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
-
-        movdqa  xmm7,xmm4
-        movdqa  xmm0,xmm3
-        paddd   xmm4,xmm1               ; xmm4=data1L
-        paddd   xmm3,xmm6               ; xmm3=data1H
-        psubd   xmm7,xmm1               ; xmm7=data6L
-        psubd   xmm0,xmm6               ; xmm0=data6H
-
-        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm1=[PD_DESCALE_P1]
-
-        paddd   xmm4,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm4,DESCALE_P1
-        psrad   xmm3,DESCALE_P1
-        paddd   xmm7,xmm1
-        paddd   xmm0,xmm1
-        psrad   xmm7,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-
-        packssdw  xmm4,xmm3             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm7,xmm0             ; xmm7=data6=(60 61 62 63 64 65 66 67)
-
-        movdqa    xmm6,xmm5             ; transpose coefficients(phase 1)
-        punpcklwd xmm5,xmm4             ; xmm5=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4             ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 1)
-        punpcklwd xmm7,xmm2             ; xmm7=(60 70 61 71 62 72 63 73)
-        punpckhwd xmm1,xmm2             ; xmm1=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
-        movdqa  xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
-        movdqa  xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
-        movdqa  xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
-        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
-        movdqa  XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
-        movdqa  XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
-
-        movdqa  xmm5,xmm3
-        movdqa  xmm6,xmm0
-        paddd   xmm3,xmm4               ; xmm3=data2L
-        paddd   xmm0,xmm2               ; xmm0=data2H
-        psubd   xmm5,xmm4               ; xmm5=data5L
-        psubd   xmm6,xmm2               ; xmm6=data5H
-
-        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm7=[PD_DESCALE_P1]
-
-        paddd   xmm3,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm3,DESCALE_P1
-        psrad   xmm0,DESCALE_P1
-        paddd   xmm5,xmm7
-        paddd   xmm6,xmm7
-        psrad   xmm5,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm3,xmm0             ; xmm3=data2=(20 21 22 23 24 25 26 27)
-        packssdw  xmm5,xmm6             ; xmm5=data5=(50 51 52 53 54 55 56 57)
-
-        movdqa  xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
-        movdqa  xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
-        movdqa  xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
-        movdqa  xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
-
-        movdqa  xmm0,xmm1
-        movdqa  xmm6,xmm4
-        paddd   xmm1,xmm2               ; xmm1=data3L
-        paddd   xmm4,xmm7               ; xmm4=data3H
-        psubd   xmm0,xmm2               ; xmm0=data4L
-        psubd   xmm6,xmm7               ; xmm6=data4H
-
-        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P1)]        ; xmm2=[PD_DESCALE_P1]
-
-        paddd   xmm1,xmm2
-        paddd   xmm4,xmm2
-        psrad   xmm1,DESCALE_P1
-        psrad   xmm4,DESCALE_P1
-        paddd   xmm0,xmm2
-        paddd   xmm6,xmm2
-        psrad   xmm0,DESCALE_P1
-        psrad   xmm6,DESCALE_P1
-
-        packssdw  xmm1,xmm4             ; xmm1=data3=(30 31 32 33 34 35 36 37)
-        packssdw  xmm0,xmm6             ; xmm0=data4=(40 41 42 43 44 45 46 47)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
-        movdqa  xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
-
-        movdqa    xmm4,xmm3             ; transpose coefficients(phase 1)
-        punpcklwd xmm3,xmm1             ; xmm3=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm4,xmm1             ; xmm4=(24 34 25 35 26 36 27 37)
-        movdqa    xmm6,xmm0             ; transpose coefficients(phase 1)
-        punpcklwd xmm0,xmm5             ; xmm0=(40 50 41 51 42 52 43 53)
-        punpckhwd xmm6,xmm5             ; xmm6=(44 54 45 55 46 56 47 57)
-
-        movdqa    xmm1,xmm7             ; transpose coefficients(phase 2)
-        punpckldq xmm7,xmm3             ; xmm7=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm1,xmm3             ; xmm1=(02 12 22 32 03 13 23 33)
-        movdqa    xmm5,xmm2             ; transpose coefficients(phase 2)
-        punpckldq xmm2,xmm4             ; xmm2=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm5,xmm4             ; xmm5=(06 16 26 36 07 17 27 37)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
-        movdqa  xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
-
-        movdqa  XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
-        movdqa  XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
-
-        movdqa    xmm2,xmm0             ; transpose coefficients(phase 2)
-        punpckldq xmm0,xmm3             ; xmm0=(40 50 60 70 41 51 61 71)
-        punpckhdq xmm2,xmm3             ; xmm2=(42 52 62 72 43 53 63 73)
-        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm4             ; xmm6=(44 54 64 74 45 55 65 75)
-        punpckhdq xmm5,xmm4             ; xmm5=(46 56 66 76 47 57 67 77)
-
-        movdqa     xmm3,xmm7            ; transpose coefficients(phase 3)
-        punpcklqdq xmm7,xmm0            ; xmm7=col0=(00 10 20 30 40 50 60 70)
-        punpckhqdq xmm3,xmm0            ; xmm3=col1=(01 11 21 31 41 51 61 71)
-        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
-        punpcklqdq xmm1,xmm2            ; xmm1=col2=(02 12 22 32 42 52 62 72)
-        punpckhqdq xmm4,xmm2            ; xmm4=col3=(03 13 23 33 43 53 63 73)
-
-        movdqa  xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
-
-        movdqa  XMMWORD [wk(8)], xmm3   ; wk(8)=col1
-        movdqa  XMMWORD [wk(9)], xmm4   ; wk(9)=col3
-
-        movdqa     xmm3,xmm0            ; transpose coefficients(phase 3)
-        punpcklqdq xmm0,xmm6            ; xmm0=col4=(04 14 24 34 44 54 64 74)
-        punpckhqdq xmm3,xmm6            ; xmm3=col5=(05 15 25 35 45 55 65 75)
-        movdqa     xmm4,xmm2            ; transpose coefficients(phase 3)
-        punpcklqdq xmm2,xmm5            ; xmm2=col6=(06 16 26 36 46 56 66 76)
-        punpckhqdq xmm4,xmm5            ; xmm4=col7=(07 17 27 37 47 57 67 77)
-
-        movdqa  XMMWORD [wk(10)], xmm3  ; wk(10)=col5
-        movdqa  XMMWORD [wk(11)], xmm4  ; wk(11)=col7
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Even part
-
-        ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
-
-        ; (Original)
-        ; z1 = (z2 + z3) * 0.541196100;
-        ; tmp2 = z1 + z3 * -1.847759065;
-        ; tmp3 = z1 + z2 * 0.765366865;
-        ;
-        ; (This implementation)
-        ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
-        ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
-
-        movdqa    xmm6,xmm1             ; xmm1=in2=z2
-        movdqa    xmm5,xmm1
-        punpcklwd xmm6,xmm2             ; xmm2=in6=z3
-        punpckhwd xmm5,xmm2
-        movdqa    xmm1,xmm6
-        movdqa    xmm2,xmm5
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_F130_F054)]       ; xmm6=tmp3L
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F130_F054)]       ; xmm5=tmp3H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm1=tmp2L
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F054_MF130)]      ; xmm2=tmp2H
-
-        movdqa    xmm3,xmm7
-        paddw     xmm7,xmm0             ; xmm7=in0+in4
-        psubw     xmm3,xmm0             ; xmm3=in0-in4
-
-        pxor      xmm4,xmm4
-        pxor      xmm0,xmm0
-        punpcklwd xmm4,xmm7             ; xmm4=tmp0L
-        punpckhwd xmm0,xmm7             ; xmm0=tmp0H
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-        psrad     xmm0,(16-CONST_BITS)  ; psrad xmm0,16 & pslld xmm0,CONST_BITS
-
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm6               ; xmm4=tmp10L
-        psubd   xmm7,xmm6               ; xmm7=tmp13L
-        movdqa  xmm6,xmm0
-        paddd   xmm0,xmm5               ; xmm0=tmp10H
-        psubd   xmm6,xmm5               ; xmm6=tmp13H
-
-        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
-        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
-        movdqa  XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
-        movdqa  XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
-
-        pxor      xmm5,xmm5
-        pxor      xmm4,xmm4
-        punpcklwd xmm5,xmm3             ; xmm5=tmp1L
-        punpckhwd xmm4,xmm3             ; xmm4=tmp1H
-        psrad     xmm5,(16-CONST_BITS)  ; psrad xmm5,16 & pslld xmm5,CONST_BITS
-        psrad     xmm4,(16-CONST_BITS)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS
-
-        movdqa  xmm0,xmm5
-        paddd   xmm5,xmm1               ; xmm5=tmp11L
-        psubd   xmm0,xmm1               ; xmm0=tmp12L
-        movdqa  xmm7,xmm4
-        paddd   xmm4,xmm2               ; xmm4=tmp11H
-        psubd   xmm7,xmm2               ; xmm7=tmp12H
-
-        movdqa  XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
-        movdqa  XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
-        movdqa  XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
-        movdqa  XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
-
-        ; -- Odd part
-
-        movdqa  xmm6, XMMWORD [wk(9)]   ; xmm6=col3
-        movdqa  xmm3, XMMWORD [wk(8)]   ; xmm3=col1
-        movdqa  xmm1, XMMWORD [wk(11)]  ; xmm1=col7
-        movdqa  xmm2, XMMWORD [wk(10)]  ; xmm2=col5
-
-        movdqa  xmm5,xmm6
-        movdqa  xmm4,xmm3
-        paddw   xmm5,xmm1               ; xmm5=z3
-        paddw   xmm4,xmm2               ; xmm4=z4
-
-        ; (Original)
-        ; z5 = (z3 + z4) * 1.175875602;
-        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
-        ; z3 += z5;  z4 += z5;
-        ;
-        ; (This implementation)
-        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
-        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
-
-        movdqa    xmm0,xmm5
-        movdqa    xmm7,xmm5
-        punpcklwd xmm0,xmm4
-        punpckhwd xmm7,xmm4
-        movdqa    xmm5,xmm0
-        movdqa    xmm4,xmm7
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm0=z3L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF078_F117)]      ; xmm7=z3H
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F117_F078)]       ; xmm5=z4L
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F117_F078)]       ; xmm4=z4H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
-
-        ; (Original)
-        ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
-        ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
-        ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
-        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
-        ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
-        ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
-        ;
-        ; (This implementation)
-        ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
-        ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
-        ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
-        ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
-        ; tmp0 += z3;  tmp1 += z4;
-        ; tmp2 += z3;  tmp3 += z4;
-
-        movdqa    xmm0,xmm1
-        movdqa    xmm7,xmm1
-        punpcklwd xmm0,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm1,xmm0
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm0=tmp0L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF089)]     ; xmm7=tmp0H
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm1=tmp3L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_MF089_F060)]      ; xmm3=tmp3H
-
-        paddd   xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
-        paddd   xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
-        paddd   xmm1,xmm5               ; xmm1=tmp3L
-        paddd   xmm3,xmm4               ; xmm3=tmp3H
-
-        movdqa  XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
-        movdqa  XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
-
-        movdqa    xmm0,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm0,xmm6
-        punpckhwd xmm7,xmm6
-        movdqa    xmm2,xmm0
-        movdqa    xmm6,xmm7
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm0=tmp1L
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF050_MF256)]     ; xmm7=tmp1H
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm2=tmp2L
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF256_F050)]      ; xmm6=tmp2H
-
-        paddd   xmm0,xmm5               ; xmm0=tmp1L
-        paddd   xmm7,xmm4               ; xmm7=tmp1H
-        paddd   xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
-        paddd   xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
-
-        movdqa  XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
-        movdqa  XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
-
-        ; -- Final output stage
-
-        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
-        movdqa  xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
-
-        movdqa  xmm0,xmm5
-        movdqa  xmm7,xmm4
-        paddd   xmm5,xmm1               ; xmm5=data0L
-        paddd   xmm4,xmm3               ; xmm4=data0H
-        psubd   xmm0,xmm1               ; xmm0=data7L
-        psubd   xmm7,xmm3               ; xmm7=data7H
-
-        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm1=[PD_DESCALE_P2]
-
-        paddd   xmm5,xmm1
-        paddd   xmm4,xmm1
-        psrad   xmm5,DESCALE_P2
-        psrad   xmm4,DESCALE_P2
-        paddd   xmm0,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm0,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm5,xmm4             ; xmm5=data0=(00 10 20 30 40 50 60 70)
-        packssdw  xmm0,xmm7             ; xmm0=data7=(07 17 27 37 47 57 67 77)
-
-        movdqa  xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
-        movdqa  xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
-
-        movdqa  xmm4,xmm3
-        movdqa  xmm7,xmm1
-        paddd   xmm3,xmm2               ; xmm3=data1L
-        paddd   xmm1,xmm6               ; xmm1=data1H
-        psubd   xmm4,xmm2               ; xmm4=data6L
-        psubd   xmm7,xmm6               ; xmm7=data6H
-
-        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm2=[PD_DESCALE_P2]
-
-        paddd   xmm3,xmm2
-        paddd   xmm1,xmm2
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm4,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm7,DESCALE_P2
-
-        packssdw  xmm3,xmm1             ; xmm3=data1=(01 11 21 31 41 51 61 71)
-        packssdw  xmm4,xmm7             ; xmm4=data6=(06 16 26 36 46 56 66 76)
-
-        packsswb  xmm5,xmm4             ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        packsswb  xmm3,xmm0             ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
-        movdqa  xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
-        movdqa  xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
-        movdqa  xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
-
-        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        movdqa  xmm4,xmm6
-        movdqa  xmm0,xmm2
-        paddd   xmm6,xmm1               ; xmm6=data2L
-        paddd   xmm2,xmm7               ; xmm2=data2H
-        psubd   xmm4,xmm1               ; xmm4=data5L
-        psubd   xmm0,xmm7               ; xmm0=data5H
-
-        movdqa  xmm5,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm5=[PD_DESCALE_P2]
-
-        paddd   xmm6,xmm5
-        paddd   xmm2,xmm5
-        psrad   xmm6,DESCALE_P2
-        psrad   xmm2,DESCALE_P2
-        paddd   xmm4,xmm5
-        paddd   xmm0,xmm5
-        psrad   xmm4,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        packssdw  xmm6,xmm2             ; xmm6=data2=(02 12 22 32 42 52 62 72)
-        packssdw  xmm4,xmm0             ; xmm4=data5=(05 15 25 35 45 55 65 75)
-
-        movdqa  xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
-        movdqa  xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
-        movdqa  xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
-        movdqa  xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
-
-        movdqa  xmm2,xmm3
-        movdqa  xmm0,xmm1
-        paddd   xmm3,xmm7               ; xmm3=data3L
-        paddd   xmm1,xmm5               ; xmm1=data3H
-        psubd   xmm2,xmm7               ; xmm2=data4L
-        psubd   xmm0,xmm5               ; xmm0=data4H
-
-        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P2)]        ; xmm7=[PD_DESCALE_P2]
-
-        paddd   xmm3,xmm7
-        paddd   xmm1,xmm7
-        psrad   xmm3,DESCALE_P2
-        psrad   xmm1,DESCALE_P2
-        paddd   xmm2,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm2,DESCALE_P2
-        psrad   xmm0,DESCALE_P2
-
-        movdqa    xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm5=[PB_CENTERJSAMP]
-
-        packssdw  xmm3,xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
-        packssdw  xmm2,xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
-
-        movdqa    xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
-        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
-
-        packsswb  xmm6,xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
-        packsswb  xmm3,xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
-
-        paddb     xmm7,xmm5
-        paddb     xmm1,xmm5
-        paddb     xmm6,xmm5
-        paddb     xmm3,xmm5
-
-        movdqa    xmm0,xmm7     ; transpose coefficients(phase 1)
-        punpcklbw xmm7,xmm1     ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
-        punpckhbw xmm0,xmm1     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
-        movdqa    xmm2,xmm6     ; transpose coefficients(phase 1)
-        punpcklbw xmm6,xmm3     ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
-        punpckhbw xmm2,xmm3     ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
-
-        movdqa    xmm4,xmm7     ; transpose coefficients(phase 2)
-        punpcklwd xmm7,xmm6     ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
-        punpckhwd xmm4,xmm6     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
-        movdqa    xmm5,xmm2     ; transpose coefficients(phase 2)
-        punpcklwd xmm2,xmm0     ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
-        punpckhwd xmm5,xmm0     ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
-
-        movdqa    xmm1,xmm7     ; transpose coefficients(phase 3)
-        punpckldq xmm7,xmm2     ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
-        punpckhdq xmm1,xmm2     ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
-        movdqa    xmm3,xmm4     ; transpose coefficients(phase 3)
-        punpckldq xmm4,xmm5     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
-        punpckhdq xmm3,xmm5     ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
-
-        pshufd  xmm6,xmm7,0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
-        pshufd  xmm0,xmm1,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
-        pshufd  xmm2,xmm4,0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
-        pshufd  xmm5,xmm3,0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
-        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
-        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
-        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
-        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctred-mmx.asm b/media/libjpeg/simd/jidctred-mmx.asm
deleted file mode 100644
index ba054e31a0..0000000000
--- a/media/libjpeg/simd/jidctred-mmx.asm
+++ /dev/null
@@ -1,705 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ      1730           ; FIX(0.211164243)
-F_0_509 equ      4176           ; FIX(0.509795579)
-F_0_601 equ      4926           ; FIX(0.601344887)
-F_0_720 equ      5906           ; FIX(0.720959822)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_850 equ      6967           ; FIX(0.850430095)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_061 equ      8697           ; FIX(1.061594337)
-F_1_272 equ     10426           ; FIX(1.272758580)
-F_1_451 equ     11893           ; FIX(1.451774981)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_2_172 equ     17799           ; FIX(2.172734803)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_624 equ     29692           ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
-F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
-F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
-F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
-F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
-F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_red_mmx)
-
-EXTN(jconst_idct_red_mmx):
-
-PW_F184_MF076   times 2 dw  F_1_847,-F_0_765
-PW_F256_F089    times 2 dw  F_2_562, F_0_899
-PW_F106_MF217   times 2 dw  F_1_061,-F_2_172
-PW_MF060_MF050  times 2 dw -F_0_601,-F_0_509
-PW_F145_MF021   times 2 dw  F_1_451,-F_0_211
-PW_F362_MF127   times 2 dw  F_3_624,-F_1_272
-PW_F085_MF072   times 2 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 2 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 2 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 2 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 2 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_mmx (void *dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM          2
-%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
-                                        ; JCOEF workspace[DCTSIZE2]
-
-        align   16
-        global  EXTN(jsimd_idct_4x4_mmx)
-
-EXTN(jsimd_idct_4x4_mmx):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [workspace]
-        pushpic ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input, store into work array.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-        lea     edi, [workspace]                        ; JCOEF *wsptr
-        mov     ecx, DCTSIZE/4                          ; ctr
-        alignx  16,7
-.columnloop:
-%ifndef NO_ZERO_COLUMN_TEST_4X4_MMX
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     mm0,mm1
-        packsswb mm0,mm0
-        movd    eax,mm0
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   mm0,PASS1_BITS
-
-        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
-        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
-        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)
-
-        movq      mm1,mm0
-        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
-        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
-        movq      mm3,mm2
-        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
-        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-        jmp     near .nextcolumn
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movq      mm4,mm0
-        movq      mm5,mm0
-        punpcklwd mm4,mm1
-        punpckhwd mm5,mm1
-        movq      mm0,mm4
-        movq      mm1,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
-
-        movq      mm6,mm2
-        movq      mm7,mm2
-        punpcklwd mm6,mm3
-        punpckhwd mm7,mm3
-        movq      mm2,mm6
-        movq      mm3,mm7
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
-        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
-
-        paddd   mm6,mm4                 ; mm6=tmp2L
-        paddd   mm7,mm5                 ; mm7=tmp2H
-        paddd   mm2,mm0                 ; mm2=tmp0L
-        paddd   mm3,mm1                 ; mm3=tmp0H
-
-        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
-        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm4, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm5, MMWORD [MMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm0, MMWORD [MMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        pxor      mm1,mm1
-        pxor      mm2,mm2
-        punpcklwd mm1,mm4               ; mm1=tmp0L
-        punpckhwd mm2,mm4               ; mm2=tmp0H
-        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
-        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-        movq      mm3,mm5               ; mm5=in2=z2
-        punpcklwd mm5,mm0               ; mm0=in6=z3
-        punpckhwd mm3,mm0
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
-
-        movq    mm4,mm1
-        movq    mm0,mm2
-        paddd   mm1,mm5                 ; mm1=tmp10L
-        paddd   mm2,mm3                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp12L
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        ; -- Final output stage
-
-        movq    mm5,mm1
-        movq    mm3,mm2
-        paddd   mm1,mm6                 ; mm1=data0L
-        paddd   mm2,mm7                 ; mm2=data0H
-        psubd   mm5,mm6                 ; mm5=data3L
-        psubd   mm3,mm7                 ; mm3=data3H
-
-        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm6=[PD_DESCALE_P1_4]
-
-        paddd   mm1,mm6
-        paddd   mm2,mm6
-        psrad   mm1,DESCALE_P1_4
-        psrad   mm2,DESCALE_P1_4
-        paddd   mm5,mm6
-        paddd   mm3,mm6
-        psrad   mm5,DESCALE_P1_4
-        psrad   mm3,DESCALE_P1_4
-
-        packssdw  mm1,mm2               ; mm1=data0=(00 01 02 03)
-        packssdw  mm5,mm3               ; mm5=data3=(30 31 32 33)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
-
-        movq    mm2,mm4
-        movq    mm3,mm0
-        paddd   mm4,mm7                 ; mm4=data1L
-        paddd   mm0,mm6                 ; mm0=data1H
-        psubd   mm2,mm7                 ; mm2=data2L
-        psubd   mm3,mm6                 ; mm3=data2H
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]       ; mm7=[PD_DESCALE_P1_4]
-
-        paddd   mm4,mm7
-        paddd   mm0,mm7
-        psrad   mm4,DESCALE_P1_4
-        psrad   mm0,DESCALE_P1_4
-        paddd   mm2,mm7
-        paddd   mm3,mm7
-        psrad   mm2,DESCALE_P1_4
-        psrad   mm3,DESCALE_P1_4
-
-        packssdw  mm4,mm0               ; mm4=data1=(10 11 12 13)
-        packssdw  mm2,mm3               ; mm2=data2=(20 21 22 23)
-
-        movq      mm6,mm1               ; transpose coefficients(phase 1)
-        punpcklwd mm1,mm4               ; mm1=(00 10 01 11)
-        punpckhwd mm6,mm4               ; mm6=(02 12 03 13)
-        movq      mm7,mm2               ; transpose coefficients(phase 1)
-        punpcklwd mm2,mm5               ; mm2=(20 30 21 31)
-        punpckhwd mm7,mm5               ; mm7=(22 32 23 33)
-
-        movq      mm0,mm1               ; transpose coefficients(phase 2)
-        punpckldq mm1,mm2               ; mm1=(00 10 20 30)
-        punpckhdq mm0,mm2               ; mm0=(01 11 21 31)
-        movq      mm3,mm6               ; transpose coefficients(phase 2)
-        punpckldq mm6,mm7               ; mm6=(02 12 22 32)
-        punpckhdq mm3,mm7               ; mm3=(03 13 23 33)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
-
-.nextcolumn:
-        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
-        add     edx, byte 4*SIZEOF_ISLOW_MULT_TYPE      ; quantptr
-        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
-        dec     ecx                                     ; ctr
-        jnz     near .columnloop
-
-        ; ---- Pass 2: process rows from work array, store into output array.
-
-        mov     eax, [original_ebp]
-        lea     esi, [workspace]                        ; JCOEF *wsptr
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-
-        movq      mm4,mm0
-        movq      mm5,mm0
-        punpcklwd mm4,mm1
-        punpckhwd mm5,mm1
-        movq      mm0,mm4
-        movq      mm1,mm5
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F256_F089)]        ; mm4=(tmp2L)
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F256_F089)]        ; mm5=(tmp2H)
-        pmaddwd   mm0,[GOTOFF(ebx,PW_F106_MF217)]       ; mm0=(tmp0L)
-        pmaddwd   mm1,[GOTOFF(ebx,PW_F106_MF217)]       ; mm1=(tmp0H)
-
-        movq      mm6,mm2
-        movq      mm7,mm2
-        punpcklwd mm6,mm3
-        punpckhwd mm7,mm3
-        movq      mm2,mm6
-        movq      mm3,mm7
-        pmaddwd   mm6,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm6=(tmp2L)
-        pmaddwd   mm7,[GOTOFF(ebx,PW_MF060_MF050)]      ; mm7=(tmp2H)
-        pmaddwd   mm2,[GOTOFF(ebx,PW_F145_MF021)]       ; mm2=(tmp0L)
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F145_MF021)]       ; mm3=(tmp0H)
-
-        paddd   mm6,mm4                 ; mm6=tmp2L
-        paddd   mm7,mm5                 ; mm7=tmp2H
-        paddd   mm2,mm0                 ; mm2=tmp0L
-        paddd   mm3,mm1                 ; mm3=tmp0H
-
-        movq    MMWORD [wk(0)], mm2     ; wk(0)=tmp0L
-        movq    MMWORD [wk(1)], mm3     ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movq    mm4, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movq    mm0, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-
-        pxor      mm1,mm1
-        pxor      mm2,mm2
-        punpcklwd mm1,mm4               ; mm1=tmp0L
-        punpckhwd mm2,mm4               ; mm2=tmp0H
-        psrad     mm1,(16-CONST_BITS-1) ; psrad mm1,16 & pslld mm1,CONST_BITS+1
-        psrad     mm2,(16-CONST_BITS-1) ; psrad mm2,16 & pslld mm2,CONST_BITS+1
-
-        movq      mm3,mm5               ; mm5=in2=z2
-        punpcklwd mm5,mm0               ; mm0=in6=z3
-        punpckhwd mm3,mm0
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F184_MF076)]       ; mm5=tmp2L
-        pmaddwd   mm3,[GOTOFF(ebx,PW_F184_MF076)]       ; mm3=tmp2H
-
-        movq    mm4,mm1
-        movq    mm0,mm2
-        paddd   mm1,mm5                 ; mm1=tmp10L
-        paddd   mm2,mm3                 ; mm2=tmp10H
-        psubd   mm4,mm5                 ; mm4=tmp12L
-        psubd   mm0,mm3                 ; mm0=tmp12H
-
-        ; -- Final output stage
-
-        movq    mm5,mm1
-        movq    mm3,mm2
-        paddd   mm1,mm6                 ; mm1=data0L
-        paddd   mm2,mm7                 ; mm2=data0H
-        psubd   mm5,mm6                 ; mm5=data3L
-        psubd   mm3,mm7                 ; mm3=data3H
-
-        movq    mm6,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm6=[PD_DESCALE_P2_4]
-
-        paddd   mm1,mm6
-        paddd   mm2,mm6
-        psrad   mm1,DESCALE_P2_4
-        psrad   mm2,DESCALE_P2_4
-        paddd   mm5,mm6
-        paddd   mm3,mm6
-        psrad   mm5,DESCALE_P2_4
-        psrad   mm3,DESCALE_P2_4
-
-        packssdw  mm1,mm2               ; mm1=data0=(00 10 20 30)
-        packssdw  mm5,mm3               ; mm5=data3=(03 13 23 33)
-
-        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp0L
-        movq    mm6, MMWORD [wk(1)]     ; mm6=tmp0H
-
-        movq    mm2,mm4
-        movq    mm3,mm0
-        paddd   mm4,mm7                 ; mm4=data1L
-        paddd   mm0,mm6                 ; mm0=data1H
-        psubd   mm2,mm7                 ; mm2=data2L
-        psubd   mm3,mm6                 ; mm3=data2H
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P2_4)]       ; mm7=[PD_DESCALE_P2_4]
-
-        paddd   mm4,mm7
-        paddd   mm0,mm7
-        psrad   mm4,DESCALE_P2_4
-        psrad   mm0,DESCALE_P2_4
-        paddd   mm2,mm7
-        paddd   mm3,mm7
-        psrad   mm2,DESCALE_P2_4
-        psrad   mm3,DESCALE_P2_4
-
-        packssdw  mm4,mm0               ; mm4=data1=(01 11 21 31)
-        packssdw  mm2,mm3               ; mm2=data2=(02 12 22 32)
-
-        movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]      ; mm6=[PB_CENTERJSAMP]
-
-        packsswb  mm1,mm2               ; mm1=(00 10 20 30 02 12 22 32)
-        packsswb  mm4,mm5               ; mm4=(01 11 21 31 03 13 23 33)
-        paddb     mm1,mm6
-        paddb     mm4,mm6
-
-        movq      mm7,mm1               ; transpose coefficients(phase 1)
-        punpcklbw mm1,mm4               ; mm1=(00 01 10 11 20 21 30 31)
-        punpckhbw mm7,mm4               ; mm7=(02 03 12 13 22 23 32 33)
-
-        movq      mm0,mm1               ; transpose coefficients(phase 2)
-        punpcklwd mm1,mm7               ; mm1=(00 01 02 03 10 11 12 13)
-        punpckhwd mm0,mm7               ; mm0=(20 21 22 23 30 31 32 33)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
-        psrlq   mm1,4*BYTE_BIT
-        psrlq   mm0,4*BYTE_BIT
-
-        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movd    DWORD [edx+eax*SIZEOF_JSAMPLE], mm1
-        movd    DWORD [esi+eax*SIZEOF_JSAMPLE], mm0
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_mmx (void *dct_table, JCOEFPTR coef_block,
-;                     JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-        align   16
-        global  EXTN(jsimd_idct_2x2_mmx)
-
-EXTN(jsimd_idct_2x2_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr
-
-        ; | input:                  | result:        |
-        ; | 00 01 ** 03 ** 05 ** 07 |                |
-        ; | 10 11 ** 13 ** 15 ** 17 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-        ; | 50 51 ** 53 ** 55 ** 57 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 70 71 ** 73 ** 75 ** 77 |                |
-
-        ; -- Odd part
-
-        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm0, MMWORD [MMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm2, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movq    mm3, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  mm2, MMWORD [MMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; mm0=(10 11 ** 13), mm1=(30 31 ** 33)
-        ; mm2=(50 51 ** 53), mm3=(70 71 ** 73)
-
-        pcmpeqd   mm7,mm7
-        pslld     mm7,WORD_BIT          ; mm7={0x0000 0xFFFF 0x0000 0xFFFF}
-
-        movq      mm4,mm0               ; mm4=(10 11 ** 13)
-        movq      mm5,mm2               ; mm5=(50 51 ** 53)
-        punpcklwd mm4,mm1               ; mm4=(10 30 11 31)
-        punpcklwd mm5,mm3               ; mm5=(50 70 51 71)
-        pmaddwd   mm4,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-        psrld   mm0,WORD_BIT            ; mm0=(11 -- 13 --)
-        pand    mm1,mm7                 ; mm1=(-- 31 -- 33)
-        psrld   mm2,WORD_BIT            ; mm2=(51 -- 53 --)
-        pand    mm3,mm7                 ; mm3=(-- 71 -- 73)
-        por     mm0,mm1                 ; mm0=(11 31 13 33)
-        por     mm2,mm3                 ; mm2=(51 71 53 73)
-        pmaddwd mm0,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd mm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd   mm4,mm5                 ; mm4=tmp0[col0 col1]
-
-        movq    mm6, MMWORD [MMBLOCK(1,1,esi,SIZEOF_JCOEF)]
-        movq    mm1, MMWORD [MMBLOCK(3,1,esi,SIZEOF_JCOEF)]
-        pmullw  mm6, MMWORD [MMBLOCK(1,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm1, MMWORD [MMBLOCK(3,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movq    mm3, MMWORD [MMBLOCK(5,1,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(7,1,esi,SIZEOF_JCOEF)]
-        pmullw  mm3, MMWORD [MMBLOCK(5,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm5, MMWORD [MMBLOCK(7,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; mm6=(** 15 ** 17), mm1=(** 35 ** 37)
-        ; mm3=(** 55 ** 57), mm5=(** 75 ** 77)
-
-        psrld   mm6,WORD_BIT            ; mm6=(15 -- 17 --)
-        pand    mm1,mm7                 ; mm1=(-- 35 -- 37)
-        psrld   mm3,WORD_BIT            ; mm3=(55 -- 57 --)
-        pand    mm5,mm7                 ; mm5=(-- 75 -- 77)
-        por     mm6,mm1                 ; mm6=(15 35 17 37)
-        por     mm3,mm5                 ; mm3=(55 75 57 77)
-        pmaddwd mm6,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd mm3,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd   mm0,mm2                 ; mm0=tmp0[col1 col3]
-        paddd   mm6,mm3                 ; mm6=tmp0[col5 col7]
-
-        ; -- Even part
-
-        movq    mm1, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movq    mm5, MMWORD [MMBLOCK(0,1,esi,SIZEOF_JCOEF)]
-        pmullw  mm1, MMWORD [MMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  mm5, MMWORD [MMBLOCK(0,1,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; mm1=(00 01 ** 03), mm5=(** 05 ** 07)
-
-        movq    mm2,mm1                         ; mm2=(00 01 ** 03)
-        pslld   mm1,WORD_BIT                    ; mm1=(-- 00 -- **)
-        psrad   mm1,(WORD_BIT-CONST_BITS-2)     ; mm1=tmp10[col0 ****]
-
-        pand    mm2,mm7                         ; mm2=(-- 01 -- 03)
-        pand    mm5,mm7                         ; mm5=(-- 05 -- 07)
-        psrad   mm2,(WORD_BIT-CONST_BITS-2)     ; mm2=tmp10[col1 col3]
-        psrad   mm5,(WORD_BIT-CONST_BITS-2)     ; mm5=tmp10[col5 col7]
-
-        ; -- Final output stage
-
-        movq      mm3,mm1
-        paddd     mm1,mm4               ; mm1=data0[col0 ****]=(A0 **)
-        psubd     mm3,mm4               ; mm3=data1[col0 ****]=(B0 **)
-        punpckldq mm1,mm3               ; mm1=(A0 B0)
-
-        movq    mm7,[GOTOFF(ebx,PD_DESCALE_P1_2)]       ; mm7=[PD_DESCALE_P1_2]
-
-        movq    mm4,mm2
-        movq    mm3,mm5
-        paddd   mm2,mm0                 ; mm2=data0[col1 col3]=(A1 A3)
-        paddd   mm5,mm6                 ; mm5=data0[col5 col7]=(A5 A7)
-        psubd   mm4,mm0                 ; mm4=data1[col1 col3]=(B1 B3)
-        psubd   mm3,mm6                 ; mm3=data1[col5 col7]=(B5 B7)
-
-        paddd   mm1,mm7
-        psrad   mm1,DESCALE_P1_2
-
-        paddd   mm2,mm7
-        paddd   mm5,mm7
-        psrad   mm2,DESCALE_P1_2
-        psrad   mm5,DESCALE_P1_2
-        paddd   mm4,mm7
-        paddd   mm3,mm7
-        psrad   mm4,DESCALE_P1_2
-        psrad   mm3,DESCALE_P1_2
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(ebp)]
-
-        ; | input:| result:|
-        ; | A0 B0 |        |
-        ; | A1 B1 | C0 C1  |
-        ; | A3 B3 | D0 D1  |
-        ; | A5 B5 |        |
-        ; | A7 B7 |        |
-
-        ; -- Odd part
-
-        packssdw  mm2,mm4               ; mm2=(A1 A3 B1 B3)
-        packssdw  mm5,mm3               ; mm5=(A5 A7 B5 B7)
-        pmaddwd   mm2,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   mm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd     mm2,mm5               ; mm2=tmp0[row0 row1]
-
-        ; -- Even part
-
-        pslld     mm1,(CONST_BITS+2)    ; mm1=tmp10[row0 row1]
-
-        ; -- Final output stage
-
-        movq      mm0,[GOTOFF(ebx,PD_DESCALE_P2_2)]     ; mm0=[PD_DESCALE_P2_2]
-
-        movq      mm6,mm1
-        paddd     mm1,mm2               ; mm1=data0[row0 row1]=(C0 C1)
-        psubd     mm6,mm2               ; mm6=data1[row0 row1]=(D0 D1)
-
-        paddd     mm1,mm0
-        paddd     mm6,mm0
-        psrad     mm1,DESCALE_P2_2
-        psrad     mm6,DESCALE_P2_2
-
-        movq      mm7,mm1               ; transpose coefficients
-        punpckldq mm1,mm6               ; mm1=(C0 D0)
-        punpckhdq mm7,mm6               ; mm7=(C1 D1)
-
-        packssdw  mm1,mm7               ; mm1=(C0 D0 C1 D1)
-        packsswb  mm1,mm1               ; mm1=(C0 D0 C1 D1 C0 D0 C1 D1)
-        paddb     mm1,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-        movd    ecx,mm1
-        movd    ebx,mm1                 ; ebx=(C0 D0 C1 D1)
-        shr     ecx,2*BYTE_BIT          ; ecx=(C1 D1 -- --)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx
-        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctred-sse2-64.asm b/media/libjpeg/simd/jidctred-sse2-64.asm
deleted file mode 100644
index a54bbe24e0..0000000000
--- a/media/libjpeg/simd/jidctred-sse2-64.asm
+++ /dev/null
@@ -1,575 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ      1730           ; FIX(0.211164243)
-F_0_509 equ      4176           ; FIX(0.509795579)
-F_0_601 equ      4926           ; FIX(0.601344887)
-F_0_720 equ      5906           ; FIX(0.720959822)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_850 equ      6967           ; FIX(0.850430095)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_061 equ      8697           ; FIX(1.061594337)
-F_1_272 equ     10426           ; FIX(1.272758580)
-F_1_451 equ     11893           ; FIX(1.451774981)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_2_172 equ     17799           ; FIX(2.172734803)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_624 equ     29692           ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
-F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
-F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
-F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
-F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
-F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076   times 4 dw  F_1_847,-F_0_765
-PW_F256_F089    times 4 dw  F_2_562, F_0_899
-PW_F106_MF217   times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050  times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021   times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127   times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072   times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-%define original_rbp    rbp+0
-%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
-        push    rbp
-        mov     rax,rsp                         ; rax = original rbp
-        sub     rsp, byte 4
-        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [rsp],rax
-        mov     rbp,rsp                         ; rbp = aligned rbp
-        lea     rsp, [wk(0)]
-        collect_args
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        por     xmm0,xmm1
-        packsswb xmm0,xmm0
-        packsswb xmm0,xmm0
-        movd    eax,xmm0
-        test    rax,rax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm0,PASS1_BITS
-
-        movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-        pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-        pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-        pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-        jmp     near .column_end
-%endif
-.columnDCT:
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm5,xmm0
-        punpcklwd xmm4,xmm1
-        punpckhwd xmm5,xmm1
-        movdqa    xmm0,xmm4
-        movdqa    xmm1,xmm5
-        pmaddwd   xmm4,[rel PW_F256_F089]       ; xmm4=(tmp2L)
-        pmaddwd   xmm5,[rel PW_F256_F089]       ; xmm5=(tmp2H)
-        pmaddwd   xmm0,[rel PW_F106_MF217]      ; xmm0=(tmp0L)
-        pmaddwd   xmm1,[rel PW_F106_MF217]      ; xmm1=(tmp0H)
-
-        movdqa    xmm6,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm6,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm2,xmm6
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2L)
-        pmaddwd   xmm7,[rel PW_MF060_MF050]     ; xmm7=(tmp2H)
-        pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0L)
-        pmaddwd   xmm3,[rel PW_F145_MF021]      ; xmm3=(tmp0H)
-
-        paddd   xmm6,xmm4               ; xmm6=tmp2L
-        paddd   xmm7,xmm5               ; xmm7=tmp2H
-        paddd   xmm2,xmm0               ; xmm2=tmp0L
-        paddd   xmm3,xmm1               ; xmm3=tmp0H
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        pxor      xmm1,xmm1
-        pxor      xmm2,xmm2
-        punpcklwd xmm1,xmm4             ; xmm1=tmp0L
-        punpckhwd xmm2,xmm4             ; xmm2=tmp0H
-        psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-        psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-        movdqa    xmm3,xmm5             ; xmm5=in2=z2
-        punpcklwd xmm5,xmm0             ; xmm0=in6=z3
-        punpckhwd xmm3,xmm0
-        pmaddwd   xmm5,[rel PW_F184_MF076]      ; xmm5=tmp2L
-        pmaddwd   xmm3,[rel PW_F184_MF076]      ; xmm3=tmp2H
-
-        movdqa  xmm4,xmm1
-        movdqa  xmm0,xmm2
-        paddd   xmm1,xmm5               ; xmm1=tmp10L
-        paddd   xmm2,xmm3               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp12L
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        ; -- Final output stage
-
-        movdqa  xmm5,xmm1
-        movdqa  xmm3,xmm2
-        paddd   xmm1,xmm6               ; xmm1=data0L
-        paddd   xmm2,xmm7               ; xmm2=data0H
-        psubd   xmm5,xmm6               ; xmm5=data3L
-        psubd   xmm3,xmm7               ; xmm3=data3H
-
-        movdqa  xmm6,[rel PD_DESCALE_P1_4]      ; xmm6=[rel PD_DESCALE_P1_4]
-
-        paddd   xmm1,xmm6
-        paddd   xmm2,xmm6
-        psrad   xmm1,DESCALE_P1_4
-        psrad   xmm2,DESCALE_P1_4
-        paddd   xmm5,xmm6
-        paddd   xmm3,xmm6
-        psrad   xmm5,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
-
-        movdqa  xmm2,xmm4
-        movdqa  xmm3,xmm0
-        paddd   xmm4,xmm7               ; xmm4=data1L
-        paddd   xmm0,xmm6               ; xmm0=data1H
-        psubd   xmm2,xmm7               ; xmm2=data2L
-        psubd   xmm3,xmm6               ; xmm3=data2H
-
-        movdqa  xmm7,[rel PD_DESCALE_P1_4]      ; xmm7=[rel PD_DESCALE_P1_4]
-
-        paddd   xmm4,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm4,DESCALE_P1_4
-        psrad   xmm0,DESCALE_P1_4
-        paddd   xmm2,xmm7
-        paddd   xmm3,xmm7
-        psrad   xmm2,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-        movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-        movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     rax, [original_rbp]
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-
-        ; -- Even part
-
-        pxor      xmm4,xmm4
-        punpcklwd xmm4,xmm1             ; xmm4=tmp0
-        psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-        ; -- Odd part
-
-        punpckhwd xmm1,xmm0
-        punpckhwd xmm6,xmm3
-        movdqa    xmm5,xmm1
-        movdqa    xmm2,xmm6
-        pmaddwd   xmm1,[rel PW_F256_F089]       ; xmm1=(tmp2)
-        pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2)
-        pmaddwd   xmm5,[rel PW_F106_MF217]      ; xmm5=(tmp0)
-        pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0)
-
-        paddd     xmm6,xmm1             ; xmm6=tmp2
-        paddd     xmm2,xmm5             ; xmm2=tmp0
-
-        ; -- Even part
-
-        punpcklwd xmm0,xmm3
-        pmaddwd   xmm0,[rel PW_F184_MF076]      ; xmm0=tmp2
-
-        movdqa    xmm7,xmm4
-        paddd     xmm4,xmm0             ; xmm4=tmp10
-        psubd     xmm7,xmm0             ; xmm7=tmp12
-
-        ; -- Final output stage
-
-        movdqa  xmm1,[rel PD_DESCALE_P2_4]      ; xmm1=[rel PD_DESCALE_P2_4]
-
-        movdqa  xmm5,xmm4
-        movdqa  xmm3,xmm7
-        paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
-        paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
-        psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
-        psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
-
-        paddd   xmm4,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm4,DESCALE_P2_4
-        psrad   xmm7,DESCALE_P2_4
-        paddd   xmm5,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm5,DESCALE_P2_4
-        psrad   xmm3,DESCALE_P2_4
-
-        packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
-        packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
-
-        movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
-        punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
-
-        packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-        paddb     xmm4,[rel PB_CENTERJSAMP]
-
-        pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-        pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-        pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
-        movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-        mov     rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
-        movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-
-        uncollect_args
-        mov     rsp,rbp         ; rsp <- aligned rbp
-        pop     rsp             ; rsp <- original rbp
-        pop     rbp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-; r10 = void *dct_table
-; r11 = JCOEFPTR coef_block
-; r12 = JSAMPARRAY output_buf
-; r13 = JDIMENSION output_col
-
-        align   16
-        global  EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     rdx, r10                ; quantptr
-        mov     rsi, r11                ; inptr
-
-        ; | input:                  | result:        |
-        ; | 00 01 ** 03 ** 05 ** 07 |                |
-        ; | 10 11 ** 13 ** 15 ** 17 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-        ; | 50 51 ** 53 ** 55 ** 57 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 70 71 ** 73 ** 75 ** 77 |                |
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-        ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-        pcmpeqd   xmm7,xmm7
-        pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-        movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
-        movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
-        punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
-        punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
-        pmaddwd   xmm4,[rel PW_F362_MF127]
-        pmaddwd   xmm5,[rel PW_F085_MF072]
-
-        psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
-        pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
-        psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
-        pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
-        por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
-        por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
-        pmaddwd xmm0,[rel PW_F362_MF127]
-        pmaddwd xmm2,[rel PW_F085_MF072]
-
-        paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
-        paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
-
-        ; -- Even part
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-        movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
-        pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
-        pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
-        psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-        psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-        ; -- Final output stage
-
-        movdqa  xmm3,xmm6
-        movdqa  xmm5,xmm1
-        paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-        paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-        psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-        psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-        movdqa  xmm2,[rel PD_DESCALE_P1_2]      ; xmm2=[rel PD_DESCALE_P1_2]
-
-        punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
-
-        movdqa     xmm7,xmm1
-        punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
-        punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
-
-        paddd   xmm6,xmm2
-        psrad   xmm6,DESCALE_P1_2
-
-        paddd   xmm1,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm1,DESCALE_P1_2
-        psrad   xmm7,DESCALE_P1_2
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     rdi, r12        ; (JSAMPROW *)
-        mov     eax, r13d
-
-        ; | input:| result:|
-        ; | A0 B0 |        |
-        ; | A1 B1 | C0 C1  |
-        ; | A3 B3 | D0 D1  |
-        ; | A5 B5 |        |
-        ; | A7 B7 |        |
-
-        ; -- Odd part
-
-        packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-        packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-        pmaddwd   xmm1,[rel PW_F362_MF127]
-        pmaddwd   xmm7,[rel PW_F085_MF072]
-
-        paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
-
-        ; -- Even part
-
-        pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
-
-        ; -- Final output stage
-
-        movdqa    xmm4,xmm6
-        paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-        psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-        punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
-
-        paddd     xmm6,[rel PD_DESCALE_P2_2]
-        psrad     xmm6,DESCALE_P2_2
-
-        packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-        packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-        paddb     xmm6,[rel PB_CENTERJSAMP]
-
-        pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
-        pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
-
-        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-        mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-        mov     WORD [rdx+rax*SIZEOF_JSAMPLE], bx
-        mov     WORD [rsi+rax*SIZEOF_JSAMPLE], cx
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jidctred-sse2.asm b/media/libjpeg/simd/jidctred-sse2.asm
deleted file mode 100644
index 232d9838d6..0000000000
--- a/media/libjpeg/simd/jidctred-sse2.asm
+++ /dev/null
@@ -1,593 +0,0 @@
-;
-; jidctred.asm - reduced-size IDCT (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; This file contains inverse-DCT routines that produce reduced-size
-; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
-; The following code is based directly on the IJG's original jidctred.c;
-; see the jidctred.c for more details.
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-
-%define CONST_BITS      13
-%define PASS1_BITS      2
-
-%define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
-%define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
-%define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
-%define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
-
-%if CONST_BITS == 13
-F_0_211 equ      1730           ; FIX(0.211164243)
-F_0_509 equ      4176           ; FIX(0.509795579)
-F_0_601 equ      4926           ; FIX(0.601344887)
-F_0_720 equ      5906           ; FIX(0.720959822)
-F_0_765 equ      6270           ; FIX(0.765366865)
-F_0_850 equ      6967           ; FIX(0.850430095)
-F_0_899 equ      7373           ; FIX(0.899976223)
-F_1_061 equ      8697           ; FIX(1.061594337)
-F_1_272 equ     10426           ; FIX(1.272758580)
-F_1_451 equ     11893           ; FIX(1.451774981)
-F_1_847 equ     15137           ; FIX(1.847759065)
-F_2_172 equ     17799           ; FIX(2.172734803)
-F_2_562 equ     20995           ; FIX(2.562915447)
-F_3_624 equ     29692           ; FIX(3.624509785)
-%else
-; NASM cannot do compile-time arithmetic on floating-point constants.
-%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
-F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
-F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
-F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
-F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
-F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
-F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
-F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
-F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
-F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
-F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
-F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
-F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
-F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
-F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
-%endif
-
-; --------------------------------------------------------------------------
-        SECTION SEG_CONST
-
-        alignz  16
-        global  EXTN(jconst_idct_red_sse2)
-
-EXTN(jconst_idct_red_sse2):
-
-PW_F184_MF076   times 4 dw  F_1_847,-F_0_765
-PW_F256_F089    times 4 dw  F_2_562, F_0_899
-PW_F106_MF217   times 4 dw  F_1_061,-F_2_172
-PW_MF060_MF050  times 4 dw -F_0_601,-F_0_509
-PW_F145_MF021   times 4 dw  F_1_451,-F_0_211
-PW_F362_MF127   times 4 dw  F_3_624,-F_1_272
-PW_F085_MF072   times 4 dw  F_0_850,-F_0_720
-PD_DESCALE_P1_4 times 4 dd  1 << (DESCALE_P1_4-1)
-PD_DESCALE_P2_4 times 4 dd  1 << (DESCALE_P2_4-1)
-PD_DESCALE_P1_2 times 4 dd  1 << (DESCALE_P1_2-1)
-PD_DESCALE_P2_2 times 4 dd  1 << (DESCALE_P2_2-1)
-PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
-
-        alignz  16
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 4x4 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-%define original_ebp    ebp+0
-%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM          2
-
-        align   16
-        global  EXTN(jsimd_idct_4x4_sse2)
-
-EXTN(jsimd_idct_4x4_sse2):
-        push    ebp
-        mov     eax,esp                         ; eax = original ebp
-        sub     esp, byte 4
-        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-        mov     [esp],eax
-        mov     ebp,esp                         ; ebp = aligned ebp
-        lea     esp, [wk(0)]
-        pushpic ebx
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-;       mov     eax, [original_ebp]
-        mov     edx, POINTER [dct_table(eax)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
-
-%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
-        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        jnz     short .columnDCT
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        por     xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        por     xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        por     xmm0,xmm1
-        packsswb xmm0,xmm0
-        packsswb xmm0,xmm0
-        movd    eax,xmm0
-        test    eax,eax
-        jnz     short .columnDCT
-
-        ; -- AC terms all zero
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        psllw   xmm0,PASS1_BITS
-
-        movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
-        punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
-        punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
-
-        pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
-        pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
-        pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
-        pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
-
-        jmp     near .column_end
-        alignx  16,7
-%endif
-.columnDCT:
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        movdqa    xmm4,xmm0
-        movdqa    xmm5,xmm0
-        punpcklwd xmm4,xmm1
-        punpckhwd xmm5,xmm1
-        movdqa    xmm0,xmm4
-        movdqa    xmm1,xmm5
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F256_F089)]       ; xmm4=(tmp2L)
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F256_F089)]       ; xmm5=(tmp2H)
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm0=(tmp0L)
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm1=(tmp0H)
-
-        movdqa    xmm6,xmm2
-        movdqa    xmm7,xmm2
-        punpcklwd xmm6,xmm3
-        punpckhwd xmm7,xmm3
-        movdqa    xmm2,xmm6
-        movdqa    xmm3,xmm7
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm6=(tmp2L)
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm7=(tmp2H)
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm2=(tmp0L)
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm3=(tmp0H)
-
-        paddd   xmm6,xmm4               ; xmm6=tmp2L
-        paddd   xmm7,xmm5               ; xmm7=tmp2H
-        paddd   xmm2,xmm0               ; xmm2=tmp0L
-        paddd   xmm3,xmm1               ; xmm3=tmp0H
-
-        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
-        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
-
-        ; -- Even part
-
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        pxor      xmm1,xmm1
-        pxor      xmm2,xmm2
-        punpcklwd xmm1,xmm4             ; xmm1=tmp0L
-        punpckhwd xmm2,xmm4             ; xmm2=tmp0H
-        psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
-        psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
-
-        movdqa    xmm3,xmm5             ; xmm5=in2=z2
-        punpcklwd xmm5,xmm0             ; xmm0=in6=z3
-        punpckhwd xmm3,xmm0
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm5=tmp2L
-        pmaddwd   xmm3,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm3=tmp2H
-
-        movdqa  xmm4,xmm1
-        movdqa  xmm0,xmm2
-        paddd   xmm1,xmm5               ; xmm1=tmp10L
-        paddd   xmm2,xmm3               ; xmm2=tmp10H
-        psubd   xmm4,xmm5               ; xmm4=tmp12L
-        psubd   xmm0,xmm3               ; xmm0=tmp12H
-
-        ; -- Final output stage
-
-        movdqa  xmm5,xmm1
-        movdqa  xmm3,xmm2
-        paddd   xmm1,xmm6               ; xmm1=data0L
-        paddd   xmm2,xmm7               ; xmm2=data0H
-        psubd   xmm5,xmm6               ; xmm5=data3L
-        psubd   xmm3,xmm7               ; xmm3=data3H
-
-        movdqa  xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)]      ; xmm6=[PD_DESCALE_P1_4]
-
-        paddd   xmm1,xmm6
-        paddd   xmm2,xmm6
-        psrad   xmm1,DESCALE_P1_4
-        psrad   xmm2,DESCALE_P1_4
-        paddd   xmm5,xmm6
-        paddd   xmm3,xmm6
-        psrad   xmm5,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
-        packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
-
-        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
-        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
-
-        movdqa  xmm2,xmm4
-        movdqa  xmm3,xmm0
-        paddd   xmm4,xmm7               ; xmm4=data1L
-        paddd   xmm0,xmm6               ; xmm0=data1H
-        psubd   xmm2,xmm7               ; xmm2=data2L
-        psubd   xmm3,xmm6               ; xmm3=data2H
-
-        movdqa  xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)]      ; xmm7=[PD_DESCALE_P1_4]
-
-        paddd   xmm4,xmm7
-        paddd   xmm0,xmm7
-        psrad   xmm4,DESCALE_P1_4
-        psrad   xmm0,DESCALE_P1_4
-        paddd   xmm2,xmm7
-        paddd   xmm3,xmm7
-        psrad   xmm2,DESCALE_P1_4
-        psrad   xmm3,DESCALE_P1_4
-
-        packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
-        packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
-
-        movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
-        punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
-        punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
-        movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
-        punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
-        punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
-
-        movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
-        punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
-        punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
-        movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
-        punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
-        punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
-.column_end:
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     eax, [original_ebp]
-        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(eax)]
-
-        ; -- Even part
-
-        pxor      xmm4,xmm4
-        punpcklwd xmm4,xmm1             ; xmm4=tmp0
-        psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
-
-        ; -- Odd part
-
-        punpckhwd xmm1,xmm0
-        punpckhwd xmm6,xmm3
-        movdqa    xmm5,xmm1
-        movdqa    xmm2,xmm6
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F256_F089)]       ; xmm1=(tmp2)
-        pmaddwd   xmm6,[GOTOFF(ebx,PW_MF060_MF050)]     ; xmm6=(tmp2)
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F106_MF217)]      ; xmm5=(tmp0)
-        pmaddwd   xmm2,[GOTOFF(ebx,PW_F145_MF021)]      ; xmm2=(tmp0)
-
-        paddd     xmm6,xmm1             ; xmm6=tmp2
-        paddd     xmm2,xmm5             ; xmm2=tmp0
-
-        ; -- Even part
-
-        punpcklwd xmm0,xmm3
-        pmaddwd   xmm0,[GOTOFF(ebx,PW_F184_MF076)]      ; xmm0=tmp2
-
-        movdqa    xmm7,xmm4
-        paddd     xmm4,xmm0             ; xmm4=tmp10
-        psubd     xmm7,xmm0             ; xmm7=tmp12
-
-        ; -- Final output stage
-
-        movdqa  xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)]      ; xmm1=[PD_DESCALE_P2_4]
-
-        movdqa  xmm5,xmm4
-        movdqa  xmm3,xmm7
-        paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
-        paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
-        psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
-        psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
-
-        paddd   xmm4,xmm1
-        paddd   xmm7,xmm1
-        psrad   xmm4,DESCALE_P2_4
-        psrad   xmm7,DESCALE_P2_4
-        paddd   xmm5,xmm1
-        paddd   xmm3,xmm1
-        psrad   xmm5,DESCALE_P2_4
-        psrad   xmm3,DESCALE_P2_4
-
-        packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
-        packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
-
-        movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
-        punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
-        punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
-
-        movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
-        punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
-        punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
-
-        packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
-        paddb     xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-        pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
-        pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
-        pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
-        movd    XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2
-        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
-        movd    XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
-        movd    XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-        poppic  ebx
-        mov     esp,ebp         ; esp <- aligned ebp
-        pop     esp             ; esp <- original ebp
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Perform dequantization and inverse DCT on one block of coefficients,
-; producing a reduced-size 2x2 output block.
-;
-; GLOBAL(void)
-; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
-;                      JSAMPARRAY output_buf, JDIMENSION output_col)
-;
-
-%define dct_table(b)    (b)+8           ; void *dct_table
-%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
-%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
-%define output_col(b)   (b)+20          ; JDIMENSION output_col
-
-        align   16
-        global  EXTN(jsimd_idct_2x2_sse2)
-
-EXTN(jsimd_idct_2x2_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        get_GOT ebx             ; get GOT address
-
-        ; ---- Pass 1: process columns from input.
-
-        mov     edx, POINTER [dct_table(ebp)]           ; quantptr
-        mov     esi, JCOEFPTR [coef_block(ebp)]         ; inptr
-
-        ; | input:                  | result:        |
-        ; | 00 01 ** 03 ** 05 ** 07 |                |
-        ; | 10 11 ** 13 ** 15 ** 17 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
-        ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
-        ; | 50 51 ** 53 ** 55 ** 57 |                |
-        ; | ** ** ** ** ** ** ** ** |                |
-        ; | 70 71 ** 73 ** 75 ** 77 |                |
-
-        ; -- Odd part
-
-        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
-        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-        pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
-        ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
-
-        pcmpeqd   xmm7,xmm7
-        pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
-
-        movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
-        movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
-        punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
-        punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
-        pmaddwd   xmm4,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   xmm5,[GOTOFF(ebx,PW_F085_MF072)]
-
-        psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
-        pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
-        psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
-        pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
-        por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
-        por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
-        pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
-        paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
-
-        ; -- Even part
-
-        movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
-        pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
-
-        ; xmm6=(00 01 ** 03 ** 05 ** 07)
-
-        movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
-        pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
-        pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
-        psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
-        psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
-
-        ; -- Final output stage
-
-        movdqa  xmm3,xmm6
-        movdqa  xmm5,xmm1
-        paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
-        paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
-        psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
-        psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
-
-        movdqa  xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)]      ; xmm2=[PD_DESCALE_P1_2]
-
-        punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
-
-        movdqa     xmm7,xmm1
-        punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
-        punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
-
-        paddd   xmm6,xmm2
-        psrad   xmm6,DESCALE_P1_2
-
-        paddd   xmm1,xmm2
-        paddd   xmm7,xmm2
-        psrad   xmm1,DESCALE_P1_2
-        psrad   xmm7,DESCALE_P1_2
-
-        ; -- Prefetch the next coefficient block
-
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
-        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
-
-        ; ---- Pass 2: process rows, store into output array.
-
-        mov     edi, JSAMPARRAY [output_buf(ebp)]       ; (JSAMPROW *)
-        mov     eax, JDIMENSION [output_col(ebp)]
-
-        ; | input:| result:|
-        ; | A0 B0 |        |
-        ; | A1 B1 | C0 C1  |
-        ; | A3 B3 | D0 D1  |
-        ; | A5 B5 |        |
-        ; | A7 B7 |        |
-
-        ; -- Odd part
-
-        packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
-        packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
-        pmaddwd   xmm1,[GOTOFF(ebx,PW_F362_MF127)]
-        pmaddwd   xmm7,[GOTOFF(ebx,PW_F085_MF072)]
-
-        paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
-
-        ; -- Even part
-
-        pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
-
-        ; -- Final output stage
-
-        movdqa    xmm4,xmm6
-        paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
-        psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
-
-        punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
-
-        paddd     xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)]
-        psrad     xmm6,DESCALE_P2_2
-
-        packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
-        packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
-        paddb     xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)]
-
-        pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
-        pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
-
-        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
-        mov     esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
-        mov     WORD [edx+eax*SIZEOF_JSAMPLE], bx
-        mov     WORD [esi+eax*SIZEOF_JSAMPLE], cx
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jpeg_nbits_table.inc b/media/libjpeg/simd/jpeg_nbits_table.inc
deleted file mode 100644
index cbc69904e1..0000000000
--- a/media/libjpeg/simd/jpeg_nbits_table.inc
+++ /dev/null
@@ -1,4097 +0,0 @@
-jpeg_nbits_table db  \
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  \
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/media/libjpeg/simd/jquant-3dn.asm b/media/libjpeg/simd/jquant-3dn.asm
deleted file mode 100644
index 0b4164b261..0000000000
--- a/media/libjpeg/simd/jquant-3dn.asm
+++ /dev/null
@@ -1,232 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (3DNow! & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                             FAST_FLOAT *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_3dnow)
-
-EXTN(jsimd_convsamp_float_3dnow):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pcmpeqw  mm7,mm7
-        psllw    mm7,7
-        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-        psubb   mm0,mm7                         ; mm0=(01234567)
-        psubb   mm1,mm7                         ; mm1=(89ABCDEF)
-
-        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
-        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
-        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
-        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
-
-        punpcklwd mm4,mm2                       ; mm4=(***0***1)
-        punpckhwd mm2,mm2                       ; mm2=(***2***3)
-        punpcklwd mm5,mm0                       ; mm5=(***4***5)
-        punpckhwd mm0,mm0                       ; mm0=(***6***7)
-
-        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(01)
-        psrad   mm2,(DWORD_BIT-BYTE_BIT)        ; mm2=(23)
-        pi2fd   mm4,mm4
-        pi2fd   mm2,mm2
-        psrad   mm5,(DWORD_BIT-BYTE_BIT)        ; mm5=(45)
-        psrad   mm0,(DWORD_BIT-BYTE_BIT)        ; mm0=(67)
-        pi2fd   mm5,mm5
-        pi2fd   mm0,mm0
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
-        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
-        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
-
-        punpcklwd mm6,mm3                       ; mm6=(***8***9)
-        punpckhwd mm3,mm3                       ; mm3=(***A***B)
-        punpcklwd mm4,mm1                       ; mm4=(***C***D)
-        punpckhwd mm1,mm1                       ; mm1=(***E***F)
-
-        psrad   mm6,(DWORD_BIT-BYTE_BIT)        ; mm6=(89)
-        psrad   mm3,(DWORD_BIT-BYTE_BIT)        ; mm3=(AB)
-        pi2fd   mm6,mm6
-        pi2fd   mm3,mm3
-        psrad   mm4,(DWORD_BIT-BYTE_BIT)        ; mm4=(CD)
-        psrad   mm1,(DWORD_BIT-BYTE_BIT)        ; mm1=(EF)
-        pi2fd   mm4,mm4
-        pi2fd   mm1,mm1
-
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
-        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
-        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
-
-        add     esi, byte 2*SIZEOF_JSAMPROW
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .convloop
-
-        femms           ; empty MMX/3DNow! state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                             FAST_FLOAT *workspace);
-;
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT *divisors
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_3dnow)
-
-EXTN(jsimd_quantize_float_3dnow):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov       eax, 0x4B400000       ; (float)0x00C00000 (rndint_magic)
-        movd      mm7,eax
-        punpckldq mm7,mm7               ; mm7={12582912.0F 12582912.0F}
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/16
-        alignx  16,7
-.quantloop:
-        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
-
-        pfadd   mm0,mm7                 ; mm0=(00 ** 01 **)
-        pfadd   mm1,mm7                 ; mm1=(02 ** 03 **)
-        pfadd   mm2,mm7                 ; mm0=(04 ** 05 **)
-        pfadd   mm3,mm7                 ; mm1=(06 ** 07 **)
-
-        movq      mm4,mm0
-        punpcklwd mm0,mm1               ; mm0=(00 02 ** **)
-        punpckhwd mm4,mm1               ; mm4=(01 03 ** **)
-        movq      mm5,mm2
-        punpcklwd mm2,mm3               ; mm2=(04 06 ** **)
-        punpckhwd mm5,mm3               ; mm5=(05 07 ** **)
-
-        punpcklwd mm0,mm4               ; mm0=(00 01 02 03)
-        punpcklwd mm2,mm5               ; mm2=(04 05 06 07)
-
-        movq    mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-        movq    mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
-        movq    mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
-        pfmul   mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
-        pfmul   mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
-
-        pfadd   mm6,mm7                 ; mm0=(10 ** 11 **)
-        pfadd   mm1,mm7                 ; mm4=(12 ** 13 **)
-        pfadd   mm3,mm7                 ; mm0=(14 ** 15 **)
-        pfadd   mm4,mm7                 ; mm4=(16 ** 17 **)
-
-        movq      mm5,mm6
-        punpcklwd mm6,mm1               ; mm6=(10 12 ** **)
-        punpckhwd mm5,mm1               ; mm5=(11 13 ** **)
-        movq      mm1,mm3
-        punpcklwd mm3,mm4               ; mm3=(14 16 ** **)
-        punpckhwd mm1,mm4               ; mm1=(15 17 ** **)
-
-        punpcklwd mm6,mm5               ; mm6=(10 11 12 13)
-        punpcklwd mm3,mm1               ; mm3=(14 15 16 17)
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-        add     esi, byte 16*SIZEOF_FAST_FLOAT
-        add     edx, byte 16*SIZEOF_FAST_FLOAT
-        add     edi, byte 16*SIZEOF_JCOEF
-        dec     eax
-        jnz     near .quantloop
-
-        femms           ; empty MMX/3DNow! state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jquant-mmx.asm b/media/libjpeg/simd/jquant-mmx.asm
deleted file mode 100644
index aed6071bfc..0000000000
--- a/media/libjpeg/simd/jquant-mmx.asm
+++ /dev/null
@@ -1,273 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                     DCTELEM *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_mmx)
-
-EXTN(jsimd_convsamp_mmx):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pxor    mm6,mm6                 ; mm6=(all 0's)
-        pcmpeqw mm7,mm7
-        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm0=(01234567)
-        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm1=(89ABCDEF)
-
-        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm2=(GHIJKLMN)
-        movq    mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm3=(OPQRSTUV)
-
-        movq      mm4,mm0
-        punpcklbw mm0,mm6               ; mm0=(0123)
-        punpckhbw mm4,mm6               ; mm4=(4567)
-        movq      mm5,mm1
-        punpcklbw mm1,mm6               ; mm1=(89AB)
-        punpckhbw mm5,mm6               ; mm5=(CDEF)
-
-        paddw   mm0,mm7
-        paddw   mm4,mm7
-        paddw   mm1,mm7
-        paddw   mm5,mm7
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5
-
-        movq      mm0,mm2
-        punpcklbw mm2,mm6               ; mm2=(GHIJ)
-        punpckhbw mm0,mm6               ; mm0=(KLMN)
-        movq      mm4,mm3
-        punpcklbw mm3,mm6               ; mm3=(OPQR)
-        punpckhbw mm4,mm6               ; mm4=(STUV)
-
-        paddw   mm2,mm7
-        paddw   mm0,mm7
-        paddw   mm3,mm7
-        paddw   mm4,mm7
-
-        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
-        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
-        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4
-
-        add     esi, byte 4*SIZEOF_JSAMPROW
-        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     short .convloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM *divisors,
-;                     DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; DCTELEM *divisors
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_mmx)
-
-EXTN(jsimd_quantize_mmx):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     ah, 2
-        alignx  16,7
-.quantloop1:
-        mov     al, DCTSIZE2/8/2
-        alignx  16,7
-.quantloop2:
-        movq    mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-        movq    mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]
-
-        movq    mm0,mm2
-        movq    mm1,mm3
-
-        psraw   mm2,(WORD_BIT-1)  ; -1 if value < 0, 0 otherwise
-        psraw   mm3,(WORD_BIT-1)
-
-        pxor    mm0,mm2   ; val = -val
-        pxor    mm1,mm3
-        psubw   mm0,mm2
-        psubw   mm1,mm3
-
-        ;
-        ; MMX is an annoyingly crappy instruction set. It has two
-        ; misfeatures that are causing problems here:
-        ;
-        ; - All multiplications are signed.
-        ;
-        ; - The second operand for the shifts is not treated as packed.
-        ;
-        ;
-        ; We work around the first problem by implementing this algorithm:
-        ;
-        ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
-        ; {
-        ;   enum { SHORT_BIT = 16 };
-        ;   signed short sx = (signed short) x;
-        ;   signed short sy = (signed short) y;
-        ;   signed long sz;
-        ;
-        ;   sz = (long) sx * (long) sy;     /* signed multiply */
-        ;
-        ;   if (sx < 0) sz += (long) sy << SHORT_BIT;
-        ;   if (sy < 0) sz += (long) sx << SHORT_BIT;
-        ;
-        ;   return (unsigned long) sz;
-        ; }
-        ;
-        ; (note that a negative sx adds _sy_ and vice versa)
-        ;
-        ; For the second problem, we replace the shift by a multiplication.
-        ; Unfortunately that means we have to deal with the signed issue again.
-        ;
-
-        paddw   mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
-        paddw   mm1, MMWORD [CORRECTION(0,1,edx)]
-
-        movq    mm4,mm0   ; store current value for later
-        movq    mm5,mm1
-        pmulhw  mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
-        pmulhw  mm1, MMWORD [RECIPROCAL(0,1,edx)]
-        paddw   mm0,mm4         ; reciprocal is always negative (MSB=1),
-        paddw   mm1,mm5   ; so we always need to add the initial value
-                        ; (input value is never negative as we
-                        ; inverted it at the start of this routine)
-
-        ; here it gets a bit tricky as both scale
-        ; and mm0/mm1 can be negative
-        movq    mm6, MMWORD [SCALE(0,0,edx)]    ; scale
-        movq    mm7, MMWORD [SCALE(0,1,edx)]
-        movq    mm4,mm0
-        movq    mm5,mm1
-        pmulhw  mm0,mm6
-        pmulhw  mm1,mm7
-
-        psraw   mm6,(WORD_BIT-1)    ; determine if scale is negative
-        psraw   mm7,(WORD_BIT-1)
-
-        pand    mm6,mm4             ; and add input if it is
-        pand    mm7,mm5
-        paddw   mm0,mm6
-        paddw   mm1,mm7
-
-        psraw   mm4,(WORD_BIT-1)    ; then check if negative input
-        psraw   mm5,(WORD_BIT-1)
-
-        pand    mm4, MMWORD [SCALE(0,0,edx)]    ; and add scale if it is
-        pand    mm5, MMWORD [SCALE(0,1,edx)]
-        paddw   mm0,mm4
-        paddw   mm1,mm5
-
-        pxor    mm0,mm2   ; val = -val
-        pxor    mm1,mm3
-        psubw   mm0,mm2
-        psubw   mm1,mm3
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1
-
-        add     esi, byte 8*SIZEOF_DCTELEM
-        add     edx, byte 8*SIZEOF_DCTELEM
-        add     edi, byte 8*SIZEOF_JCOEF
-        dec     al
-        jnz     near .quantloop2
-        dec     ah
-        jnz     near .quantloop1        ; to avoid branch misprediction
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jquant-sse.asm b/media/libjpeg/simd/jquant-sse.asm
deleted file mode 100644
index 1baf88f257..0000000000
--- a/media/libjpeg/simd/jquant-sse.asm
+++ /dev/null
@@ -1,210 +0,0 @@
-;
-; jquant.asm - sample data conversion and quantization (SSE & MMX)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                           FAST_FLOAT *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_sse)
-
-EXTN(jsimd_convsamp_float_sse):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pcmpeqw  mm7,mm7
-        psllw    mm7,7
-        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-        psubb   mm0,mm7                         ; mm0=(01234567)
-        psubb   mm1,mm7                         ; mm1=(89ABCDEF)
-
-        punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
-        punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
-        punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
-        punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
-
-        punpcklwd mm4,mm2                       ; mm4=(***0***1)
-        punpckhwd mm2,mm2                       ; mm2=(***2***3)
-        punpcklwd mm5,mm0                       ; mm5=(***4***5)
-        punpckhwd mm0,mm0                       ; mm0=(***6***7)
-
-        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01)
-        psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
-        cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
-        cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
-        psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
-        psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
-        cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
-        cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)
-
-        punpcklwd mm6,mm3                       ; mm6=(***8***9)
-        punpckhwd mm3,mm3                       ; mm3=(***A***B)
-        punpcklwd mm4,mm1                       ; mm4=(***C***D)
-        punpckhwd mm1,mm1                       ; mm1=(***E***F)
-
-        psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
-        psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
-        cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
-        cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
-        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
-        psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
-        cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
-        cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)
-
-        movlhps   xmm0,xmm1                     ; xmm0=(0123)
-        movlhps   xmm2,xmm3                     ; xmm2=(4567)
-        movlhps   xmm4,xmm5                     ; xmm4=(89AB)
-        movlhps   xmm6,xmm7                     ; xmm6=(CDEF)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
-
-        add     esi, byte 2*SIZEOF_JSAMPROW
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     near .convloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                           FAST_FLOAT *workspace);
-;
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT *divisors
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_sse)
-
-EXTN(jsimd_quantize_float_sse):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/16
-        alignx  16,7
-.quantloop:
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-        movhlps  xmm4,xmm0
-        movhlps  xmm5,xmm1
-
-        cvtps2pi mm0,xmm0
-        cvtps2pi mm1,xmm1
-        cvtps2pi mm4,xmm4
-        cvtps2pi mm5,xmm5
-
-        movhlps  xmm6,xmm2
-        movhlps  xmm7,xmm3
-
-        cvtps2pi mm2,xmm2
-        cvtps2pi mm3,xmm3
-        cvtps2pi mm6,xmm6
-        cvtps2pi mm7,xmm7
-
-        packssdw mm0,mm4
-        packssdw mm1,mm5
-        packssdw mm2,mm6
-        packssdw mm3,mm7
-
-        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
-        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
-        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
-        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
-
-        add     esi, byte 16*SIZEOF_FAST_FLOAT
-        add     edx, byte 16*SIZEOF_FAST_FLOAT
-        add     edi, byte 16*SIZEOF_JCOEF
-        dec     eax
-        jnz     short .quantloop
-
-        emms            ; empty MMX state
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jquantf-sse2-64.asm b/media/libjpeg/simd/jquantf-sse2-64.asm
deleted file mode 100644
index ef5c1f959e..0000000000
--- a/media/libjpeg/simd/jquantf-sse2-64.asm
+++ /dev/null
@@ -1,157 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        pcmpeqw  xmm7,xmm7
-        psllw    xmm7,7
-        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov rsi, r10
-        mov     eax, r11d
-        mov rdi, r12
-        mov     rcx, DCTSIZE/2
-.convloop:
-        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
-        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
-
-        psubb   xmm0,xmm7                       ; xmm0=(01234567)
-        psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
-
-        punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
-        punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
-
-        punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
-        punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
-        punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
-        punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
-
-        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
-        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
-        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
-        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
-
-        add     rsi, byte 2*SIZEOF_JSAMPROW
-        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     rcx
-        jnz     short .convloop
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                         FAST_FLOAT *workspace);
-;
-
-; r10 = JCOEFPTR coef_block
-; r11 = FAST_FLOAT *divisors
-; r12 = FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov rsi, r12
-        mov rdx, r11
-        mov rdi, r10
-        mov     rax, DCTSIZE2/16
-.quantloop:
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
-
-        cvtps2dq xmm0,xmm0
-        cvtps2dq xmm1,xmm1
-        cvtps2dq xmm2,xmm2
-        cvtps2dq xmm3,xmm3
-
-        packssdw xmm0,xmm1
-        packssdw xmm2,xmm3
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
-
-        add     rsi, byte 16*SIZEOF_FAST_FLOAT
-        add     rdx, byte 16*SIZEOF_FAST_FLOAT
-        add     rdi, byte 16*SIZEOF_JCOEF
-        dec     rax
-        jnz     short .quantloop
-
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jquantf-sse2.asm b/media/libjpeg/simd/jquantf-sse2.asm
deleted file mode 100644
index 1cbc267400..0000000000
--- a/media/libjpeg/simd/jquantf-sse2.asm
+++ /dev/null
@@ -1,170 +0,0 @@
-;
-; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                            FAST_FLOAT *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_float_sse2)
-
-EXTN(jsimd_convsamp_float_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pcmpeqw  xmm7,xmm7
-        psllw    xmm7,7
-        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/2
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
-        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
-
-        psubb   xmm0,xmm7                       ; xmm0=(01234567)
-        psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
-
-        punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
-        punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
-
-        punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
-        punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
-        punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
-        punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
-
-        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
-        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
-        cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
-        cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
-        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
-        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
-        cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
-        cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
-
-        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
-        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
-        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
-        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
-
-        add     esi, byte 2*SIZEOF_JSAMPROW
-        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
-        dec     ecx
-        jnz     short .convloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; GLOBAL(void)
-; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-;                            FAST_FLOAT *workspace);
-;
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; FAST_FLOAT *divisors
-%define workspace       ebp+16          ; FAST_FLOAT *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_float_sse2)
-
-EXTN(jsimd_quantize_float_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/16
-        alignx  16,7
-.quantloop:
-        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
-        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
-        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
-        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
-        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
-
-        cvtps2dq xmm0,xmm0
-        cvtps2dq xmm1,xmm1
-        cvtps2dq xmm2,xmm2
-        cvtps2dq xmm3,xmm3
-
-        packssdw xmm0,xmm1
-        packssdw xmm2,xmm3
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
-
-        add     esi, byte 16*SIZEOF_FAST_FLOAT
-        add     edx, byte 16*SIZEOF_FAST_FLOAT
-        add     edi, byte 16*SIZEOF_JCOEF
-        dec     eax
-        jnz     short .quantloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jquanti-sse2-64.asm b/media/libjpeg/simd/jquanti-sse2-64.asm
deleted file mode 100644
index 66c4e51907..0000000000
--- a/media/libjpeg/simd/jquanti-sse2-64.asm
+++ /dev/null
@@ -1,186 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2009, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    64
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM *workspace);
-;
-
-; r10 = JSAMPARRAY sample_data
-; r11 = JDIMENSION start_col
-; r12 = DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-        push    rbx
-
-        pxor    xmm6,xmm6               ; xmm6=(all 0's)
-        pcmpeqw xmm7,xmm7
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        mov rsi, r10
-        mov eax, r11d
-        mov rdi, r12
-        mov     rcx, DCTSIZE/4
-.convloop:
-        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
-        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
-
-        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
-        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
-
-        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
-        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
-        paddw     xmm0,xmm7
-        paddw     xmm1,xmm7
-        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
-        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
-        paddw     xmm2,xmm7
-        paddw     xmm3,xmm7
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-        add     rsi, byte 4*SIZEOF_JSAMPROW
-        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     rcx
-        jnz     short .convloop
-
-        pop     rbx
-        uncollect_args
-        pop     rbp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-;                      DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-; r10 = JCOEFPTR coef_block
-; r11 = DCTELEM *divisors
-; r12 = DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
-        push    rbp
-        mov     rax,rsp
-        mov     rbp,rsp
-        collect_args
-
-        mov rsi, r12
-        mov rdx, r11
-        mov rdi, r10
-        mov     rax, DCTSIZE2/32
-.quantloop:
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
-        movdqa  xmm0,xmm4
-        movdqa  xmm1,xmm5
-        movdqa  xmm2,xmm6
-        movdqa  xmm3,xmm7
-        psraw   xmm4,(WORD_BIT-1)
-        psraw   xmm5,(WORD_BIT-1)
-        psraw   xmm6,(WORD_BIT-1)
-        psraw   xmm7,(WORD_BIT-1)
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
-        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
-        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
-        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
-
-        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
-        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
-        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
-        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
-        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
-        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
-        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
-        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
-        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]  ; scale
-        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
-        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
-        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
-
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4
-        psubw   xmm1,xmm5
-        psubw   xmm2,xmm6
-        psubw   xmm3,xmm7
-        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
-
-        add     rsi, byte 32*SIZEOF_DCTELEM
-        add     rdx, byte 32*SIZEOF_DCTELEM
-        add     rdi, byte 32*SIZEOF_JCOEF
-        dec     rax
-        jnz     near .quantloop
-
-        uncollect_args
-        pop     rbp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jquanti-sse2.asm b/media/libjpeg/simd/jquanti-sse2.asm
deleted file mode 100644
index aea8604e22..0000000000
--- a/media/libjpeg/simd/jquanti-sse2.asm
+++ /dev/null
@@ -1,199 +0,0 @@
-;
-; jquanti.asm - sample data conversion and quantization (SSE2)
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Load data into workspace, applying unsigned->signed conversion
-;
-; GLOBAL(void)
-; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
-;                      DCTELEM *workspace);
-;
-
-%define sample_data     ebp+8           ; JSAMPARRAY sample_data
-%define start_col       ebp+12          ; JDIMENSION start_col
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_convsamp_sse2)
-
-EXTN(jsimd_convsamp_sse2):
-        push    ebp
-        mov     ebp,esp
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        pxor    xmm6,xmm6               ; xmm6=(all 0's)
-        pcmpeqw xmm7,xmm7
-        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
-        mov     eax, JDIMENSION [start_col]
-        mov     edi, POINTER [workspace]        ; (DCTELEM *)
-        mov     ecx, DCTSIZE/4
-        alignx  16,7
-.convloop:
-        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
-        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
-
-        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
-
-        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
-        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
-
-        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
-        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
-        paddw     xmm0,xmm7
-        paddw     xmm1,xmm7
-        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
-        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
-        paddw     xmm2,xmm7
-        paddw     xmm3,xmm7
-
-        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-        add     esi, byte 4*SIZEOF_JSAMPROW
-        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
-        dec     ecx
-        jnz     short .convloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        pop     ebp
-        ret
-
-; --------------------------------------------------------------------------
-;
-; Quantize/descale the coefficients, and store into coef_block
-;
-; This implementation is based on an algorithm described in
-;   "How to optimize for the Pentium family of microprocessors"
-;   (http://www.agner.org/assem/).
-;
-; GLOBAL(void)
-; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
-;                      DCTELEM *workspace);
-;
-
-%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
-%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
-%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
-
-%define coef_block      ebp+8           ; JCOEFPTR coef_block
-%define divisors        ebp+12          ; DCTELEM *divisors
-%define workspace       ebp+16          ; DCTELEM *workspace
-
-        align   16
-        global  EXTN(jsimd_quantize_sse2)
-
-EXTN(jsimd_quantize_sse2):
-        push    ebp
-        mov     ebp,esp
-;       push    ebx             ; unused
-;       push    ecx             ; unused
-;       push    edx             ; need not be preserved
-        push    esi
-        push    edi
-
-        mov     esi, POINTER [workspace]
-        mov     edx, POINTER [divisors]
-        mov     edi, JCOEFPTR [coef_block]
-        mov     eax, DCTSIZE2/32
-        alignx  16,7
-.quantloop:
-        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
-        movdqa  xmm0,xmm4
-        movdqa  xmm1,xmm5
-        movdqa  xmm2,xmm6
-        movdqa  xmm3,xmm7
-        psraw   xmm4,(WORD_BIT-1)
-        psraw   xmm5,(WORD_BIT-1)
-        psraw   xmm6,(WORD_BIT-1)
-        psraw   xmm7,(WORD_BIT-1)
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
-        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
-        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
-        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
-
-        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
-        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
-        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
-        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
-        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
-        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
-        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
-        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
-        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]  ; scale
-        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
-        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
-        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
-
-        pxor    xmm0,xmm4
-        pxor    xmm1,xmm5
-        pxor    xmm2,xmm6
-        pxor    xmm3,xmm7
-        psubw   xmm0,xmm4
-        psubw   xmm1,xmm5
-        psubw   xmm2,xmm6
-        psubw   xmm3,xmm7
-        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
-        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
-        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
-        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
-
-        add     esi, byte 32*SIZEOF_DCTELEM
-        add     edx, byte 32*SIZEOF_DCTELEM
-        add     edi, byte 32*SIZEOF_JCOEF
-        dec     eax
-        jnz     near .quantloop
-
-        pop     edi
-        pop     esi
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; unused
-;       pop     ebx             ; unused
-        pop     ebp
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jsimd.h b/media/libjpeg/simd/jsimd.h
index dc6ec430db..64747c6360 100644
--- a/media/libjpeg/simd/jsimd.h
+++ b/media/libjpeg/simd/jsimd.h
@@ -2,10 +2,12 @@
  * simd/jsimd.h
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2011, 2014-2016, D. R. Commander.
+ * Copyright (C) 2011, 2014-2016, 2018, 2020, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
- * Copyright (C) 2015-2016, Matthieu Darbois.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,857 +17,1242 @@
 
 /* Bitmask for supported acceleration methods */
 
-#define JSIMD_NONE       0x00
-#define JSIMD_MMX        0x01
-#define JSIMD_3DNOW      0x02
-#define JSIMD_SSE        0x04
-#define JSIMD_SSE2       0x08
-#define JSIMD_ARM_NEON   0x10
-#define JSIMD_MIPS_DSPR2 0x20
-#define JSIMD_ALTIVEC    0x40
+#define JSIMD_NONE     0x00
+#define JSIMD_MMX      0x01
+#define JSIMD_3DNOW    0x02
+#define JSIMD_SSE      0x04
+#define JSIMD_SSE2     0x08
+#define JSIMD_NEON     0x10
+#define JSIMD_DSPR2    0x20
+#define JSIMD_ALTIVEC  0x40
+#define JSIMD_AVX2     0x80
+#define JSIMD_MMI      0x100
 
 /* SIMD Ext: retrieve SIMD/CPU information */
-EXTERN(unsigned int) jpeg_simd_cpu_support (void);
+EXTERN(unsigned int) jpeg_simd_cpu_support(void);
 
 /* RGB & extended RGB --> YCC Colorspace Conversion */
 EXTERN(void) jsimd_rgb_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 extern const int jconst_rgb_ycc_convert_sse2[];
 EXTERN(void) jsimd_rgb_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+extern const int jconst_rgb_ycc_convert_avx2[];
+EXTERN(void) jsimd_rgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 EXTERN(void) jsimd_rgb_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_neon
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+#ifndef NEON_INTRINSICS
 
 EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
-EXTERN(void) jsimd_rgb_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_ycc_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+#endif
+
+EXTERN(void) jsimd_rgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_ycc_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 EXTERN(void) jsimd_rgb_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_ycc_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 /* RGB & extended RGB --> Grayscale Colorspace Conversion */
 EXTERN(void) jsimd_rgb_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_gray_convert_mmx
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 extern const int jconst_rgb_gray_convert_sse2[];
 EXTERN(void) jsimd_rgb_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_gray_convert_sse2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
-EXTERN(void) jsimd_rgb_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgb_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extrgbx_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgr_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extbgrx_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxbgr_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
-EXTERN(void) jsimd_extxrgb_gray_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+extern const int jconst_rgb_gray_convert_avx2[];
+EXTERN(void) jsimd_rgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_avx2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 EXTERN(void) jsimd_rgb_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgb_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extrgbx_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgr_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extbgrx_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxbgr_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 EXTERN(void) jsimd_extxrgb_gray_convert_altivec
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows);
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
 
 /* YCC --> RGB & extended RGB Colorspace Conversion */
 EXTERN(void) jsimd_ycc_rgb_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_mmx
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 extern const int jconst_ycc_rgb_convert_sse2[];
 EXTERN(void) jsimd_ycc_rgb_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_sse2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+extern const int jconst_ycc_rgb_convert_avx2[];
+EXTERN(void) jsimd_ycc_rgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_avx2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 EXTERN(void) jsimd_ycc_rgb_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_rgb565_convert_neon
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+#ifndef NEON_INTRINSICS
 
 EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
-EXTERN(void) jsimd_ycc_rgb_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgb_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extrgbx_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgr_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extbgrx_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxbgr_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
-EXTERN(void) jsimd_ycc_extxrgb_convert_mips_dspr2
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+#endif
+
+EXTERN(void) jsimd_ycc_rgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_dspr2
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+
+EXTERN(void) jsimd_ycc_rgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extrgbx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extbgrx_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxbgr_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
+EXTERN(void) jsimd_ycc_extxrgb_convert_mmi
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 EXTERN(void) jsimd_ycc_rgb_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgb_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extrgbx_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgr_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extbgrx_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxbgr_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 EXTERN(void) jsimd_ycc_extxrgb_convert_altivec
-        (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
-         JSAMPARRAY output_buf, int num_rows);
+  (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
+   JSAMPARRAY output_buf, int num_rows);
 
 /* NULL Colorspace Conversion */
-EXTERN(void) jsimd_c_null_convert_mips_dspr2
-        (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-         JDIMENSION output_row, int num_rows, int num_components);
+EXTERN(void) jsimd_c_null_convert_dspr2
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows, int num_components);
 
 /* h2v1 Downsampling */
 EXTERN(void) jsimd_h2v1_downsample_mmx
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v1_downsample_sse2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v1_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v1_downsample_neon
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
-EXTERN(void) jsimd_h2v1_downsample_mips_dspr2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v1_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v1_downsample_altivec
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 /* h2v2 Downsampling */
 EXTERN(void) jsimd_h2v2_downsample_mmx
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v2_downsample_sse2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_avx2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v2_downsample_neon
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
-EXTERN(void) jsimd_h2v2_downsample_mips_dspr2
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+EXTERN(void) jsimd_h2v2_downsample_dspr2
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
+
+EXTERN(void) jsimd_h2v2_downsample_mmi
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 EXTERN(void) jsimd_h2v2_downsample_altivec
-        (JDIMENSION image_width, int max_v_samp_factor,
-         JDIMENSION v_samp_factor, JDIMENSION width_blocks,
-         JSAMPARRAY input_data, JSAMPARRAY output_data);
+  (JDIMENSION image_width, int max_v_samp_factor, JDIMENSION v_samp_factor,
+   JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data);
 
 /* h2v2 Smooth Downsampling */
-EXTERN(void) jsimd_h2v2_smooth_downsample_mips_dspr2
-        (JSAMPARRAY input_data, JSAMPARRAY output_data,
-         JDIMENSION v_samp_factor, int max_v_samp_factor,
-         int smoothing_factor, JDIMENSION width_blocks,
-         JDIMENSION image_width);
+EXTERN(void) jsimd_h2v2_smooth_downsample_dspr2
+  (JSAMPARRAY input_data, JSAMPARRAY output_data, JDIMENSION v_samp_factor,
+   int max_v_samp_factor, int smoothing_factor, JDIMENSION width_in_blocks,
+   JDIMENSION image_width);
 
 
 /* Upsampling */
 EXTERN(void) jsimd_h2v1_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_h2v1_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_int_upsample_mips_dspr2
-        (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
-         int max_v_samp_factor);
+EXTERN(void) jsimd_h2v1_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_int_upsample_dspr2
+  (UINT8 h_expand, UINT8 v_expand, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr, JDIMENSION output_width,
+   int max_v_samp_factor);
 
 EXTERN(void) jsimd_h2v1_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
-         JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 /* Fancy Upsampling */
 EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 extern const int jconst_fancy_upsample_sse2[];
 EXTERN(void) jsimd_h2v1_fancy_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_sse2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+extern const int jconst_fancy_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_neon
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
-EXTERN(void) jsimd_h2v1_fancy_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
-EXTERN(void) jsimd_h2v2_fancy_upsample_mips_dspr2
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_altivec
-        (int max_v_samp_factor, JDIMENSION downsampled_width,
-         JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 /* Merged Upsampling */
 EXTERN(void) jsimd_h2v1_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 EXTERN(void) jsimd_h2v2_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 extern const int jconst_merged_upsample_sse2[];
 EXTERN(void) jsimd_h2v1_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 EXTERN(void) jsimd_h2v2_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
-EXTERN(void) jsimd_h2v1_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
+extern const int jconst_merged_upsample_avx2[];
+EXTERN(void) jsimd_h2v1_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
-EXTERN(void) jsimd_h2v2_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
-EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf, JSAMPLE* range);
+EXTERN(void) jsimd_h2v2_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf, JSAMPLE *range);
+
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 EXTERN(void) jsimd_h2v1_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 EXTERN(void) jsimd_h2v2_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_altivec
-        (JDIMENSION output_width, JSAMPIMAGE input_buf,
-         JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf);
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
 
 /* Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_sse2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+
+EXTERN(void) jsimd_convsamp_avx2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_neon
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
-EXTERN(void) jsimd_convsamp_mips_dspr2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+EXTERN(void) jsimd_convsamp_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 EXTERN(void) jsimd_convsamp_altivec
-        (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace);
 
 /* Floating Point Sample Conversion */
 EXTERN(void) jsimd_convsamp_float_3dnow
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_convsamp_float_sse
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_convsamp_float_sse2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
-EXTERN(void) jsimd_convsamp_float_mips_dspr2
-        (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_convsamp_float_dspr2
+  (JSAMPARRAY sample_data, JDIMENSION start_col, FAST_FLOAT *workspace);
 
-/* Slow Integer Forward DCT */
-EXTERN(void) jsimd_fdct_islow_mmx (DCTELEM *data);
+/* Accurate Integer Forward DCT */
+EXTERN(void) jsimd_fdct_islow_mmx(DCTELEM *data);
 
 extern const int jconst_fdct_islow_sse2[];
-EXTERN(void) jsimd_fdct_islow_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_sse2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_islow_neon (DCTELEM *data);
+extern const int jconst_fdct_islow_avx2[];
+EXTERN(void) jsimd_fdct_islow_avx2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_islow_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_neon(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_islow_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_islow_dspr2(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_islow_altivec(DCTELEM *data);
 
 /* Fast Integer Forward DCT */
-EXTERN(void) jsimd_fdct_ifast_mmx (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmx(DCTELEM *data);
 
 extern const int jconst_fdct_ifast_sse2[];
-EXTERN(void) jsimd_fdct_ifast_sse2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_sse2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_neon (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_mips_dspr2 (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
 
-EXTERN(void) jsimd_fdct_ifast_altivec (DCTELEM *data);
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
+EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
 
 /* Floating Point Forward DCT */
-EXTERN(void) jsimd_fdct_float_3dnow (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_3dnow(FAST_FLOAT *data);
 
 extern const int jconst_fdct_float_sse[];
-EXTERN(void) jsimd_fdct_float_sse (FAST_FLOAT *data);
+EXTERN(void) jsimd_fdct_float_sse(FAST_FLOAT *data);
 
 /* Quantization */
 EXTERN(void) jsimd_quantize_mmx
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_sse2
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_avx2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_neon
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
-EXTERN(void) jsimd_quantize_mips_dspr2
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+EXTERN(void) jsimd_quantize_dspr2
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+
+EXTERN(void) jsimd_quantize_mmi
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 EXTERN(void) jsimd_quantize_altivec
-        (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
+  (JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace);
 
 /* Floating Point Quantization */
 EXTERN(void) jsimd_quantize_float_3dnow
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_quantize_float_sse
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 EXTERN(void) jsimd_quantize_float_sse2
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
-EXTERN(void) jsimd_quantize_float_mips_dspr2
-        (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
+EXTERN(void) jsimd_quantize_float_dspr2
+  (JCOEFPTR coef_block, FAST_FLOAT *divisors, FAST_FLOAT *workspace);
 
 /* Scaled Inverse DCT */
 EXTERN(void) jsimd_idct_2x2_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_red_sse2[];
 EXTERN(void) jsimd_idct_2x2_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_2x2_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 EXTERN(void) jsimd_idct_4x4_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_2x2_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
-EXTERN(void) jsimd_idct_4x4_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col, int *workspace);
-EXTERN(void) jsimd_idct_6x6_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
-EXTERN(void) jsimd_idct_12x12_pass1_mips_dspr2
-        (JCOEFPTR coef_block, void *dct_table, int *workspace);
-EXTERN(void) jsimd_idct_12x12_pass2_mips_dspr2
-        (int *workspace, int *output);
+EXTERN(void) jsimd_idct_2x2_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_4x4_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col, int *workspace);
+EXTERN(void) jsimd_idct_6x6_dspr2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+EXTERN(void) jsimd_idct_12x12_pass1_dspr2
+  (JCOEFPTR coef_block, void *dct_table, int *workspace);
+EXTERN(void) jsimd_idct_12x12_pass2_dspr2
+  (int *workspace, int *output);
 
-/* Slow Integer Inverse DCT */
+/* Accurate Integer Inverse DCT */
 EXTERN(void) jsimd_idct_islow_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_islow_sse2[];
 EXTERN(void) jsimd_idct_islow_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
+extern const int jconst_idct_islow_avx2[];
+EXTERN(void) jsimd_idct_islow_avx2
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_islow_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_islow_mips_dspr2
-        (void *dct_table, JCOEFPTR coef_block, int *output_buf,
-         JSAMPLE *output_col);
+EXTERN(void) jsimd_idct_islow_dspr2
+  (void *dct_table, JCOEFPTR coef_block, int *output_buf, JSAMPLE *output_col);
+
+EXTERN(void) jsimd_idct_islow_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_islow_altivec
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 /* Fast Integer Inverse DCT */
 EXTERN(void) jsimd_idct_ifast_mmx
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_ifast_sse2[];
 EXTERN(void) jsimd_idct_ifast_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_ifast_neon
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
-EXTERN(void) jsimd_idct_ifast_cols_mips_dspr2
-        (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
-         const int *idct_coefs);
-EXTERN(void) jsimd_idct_ifast_rows_mips_dspr2
-        (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
-         const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_cols_dspr2
+  (JCOEF *inptr, IFAST_MULT_TYPE *quantptr, DCTELEM *wsptr,
+   const int *idct_coefs);
+EXTERN(void) jsimd_idct_ifast_rows_dspr2
+  (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
+   const int *idct_coefs);
+
+EXTERN(void) jsimd_idct_ifast_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 EXTERN(void) jsimd_idct_ifast_altivec
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 /* Floating Point Inverse DCT */
 EXTERN(void) jsimd_idct_float_3dnow
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse[];
 EXTERN(void) jsimd_idct_float_sse
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 extern const int jconst_idct_float_sse2[];
 EXTERN(void) jsimd_idct_float_sse2
-        (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
-         JDIMENSION output_col);
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
 
 /* Huffman coding */
 extern const int jconst_huff_encode_one_block[];
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_sse2
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
-EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
-        (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
-         c_derived_tbl *dctbl, c_derived_tbl *actbl);
+#ifndef NEON_INTRINSICS
+
+EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
+  (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+   c_derived_tbl *dctbl, c_derived_tbl *actbl);
+
+#endif
+
+/* Progressive Huffman encoding */
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/media/libjpeg/simd/jsimd_arm.c b/media/libjpeg/simd/jsimd_arm.c
deleted file mode 100644
index 61cd073e1b..0000000000
--- a/media/libjpeg/simd/jsimd_arm.c
+++ /dev/null
@@ -1,727 +0,0 @@
-/*
- * jsimd_arm.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
-  char *p;
-  if (*feature == 0)
-    return 0;
-  if (strncmp(buffer, "Features", 8) != 0)
-    return 0;
-  buffer += 8;
-  while (isspace(*buffer))
-    buffer++;
-
-  /* Check if 'feature' is present in the buffer as a separate word */
-  while ((p = strstr(buffer, feature))) {
-    if (p > buffer && !isspace(*(p - 1))) {
-      buffer++;
-      continue;
-    }
-    p += strlen(feature);
-    if (*p != 0 && !isspace(*p)) {
-      buffer++;
-      continue;
-    }
-    return 1;
-  }
-  return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
-  char *buffer = (char *)malloc(bufsize);
-  FILE *fd;
-  simd_support = 0;
-
-  if (!buffer)
-    return 0;
-
-  fd = fopen("/proc/cpuinfo", "r");
-  if (fd) {
-    while (fgets(buffer, bufsize, fd)) {
-      if (!strchr(buffer, '\n') && !feof(fd)) {
-        /* "impossible" happened - insufficient size of the buffer! */
-        fclose(fd);
-        free(buffer);
-        return 0;
-      }
-      if (check_feature(buffer, "neon"))
-        simd_support |= JSIMD_ARM_NEON;
-    }
-    fclose(fd);
-  }
-  free(buffer);
-  return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-#if !defined(__ARM_NEON__) && defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-#if defined(__ARM_NEON__)
-  simd_support |= JSIMD_ARM_NEON;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  /* We still have a chance to use NEON regardless of globally used
-   * -mcpu/-mfpu options passed to gcc by performing runtime detection via
-   * /proc/cpuinfo parsing on linux/android */
-  while (!parse_proc_cpuinfo(bufsize)) {
-    bufsize *= 2;
-    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
-      break;
-  }
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENEON");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_ARM_NEON;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      neonfct=jsimd_extrgb_ycc_convert_neon;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_extrgbx_ycc_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      neonfct=jsimd_extbgr_ycc_convert_neon;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_extbgrx_ycc_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_extxbgr_ycc_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_extxrgb_ycc_convert_neon;
-      break;
-    default:
-      neonfct=jsimd_extrgb_ycc_convert_neon;
-      break;
-  }
-
-  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_ycc_extrgbx_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      neonfct=jsimd_ycc_extbgr_convert_neon;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_ycc_extbgrx_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_ycc_extxbgr_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_ycc_extxrgb_convert_neon;
-      break;
-    default:
-      neonfct=jsimd_ycc_extrgb_convert_neon;
-      break;
-  }
-
-  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width, input_data,
-                                 output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON && simd_huffman)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
-                                          dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_arm64.c b/media/libjpeg/simd/jsimd_arm64.c
deleted file mode 100644
index 09449bb6fd..0000000000
--- a/media/libjpeg/simd/jsimd_arm64.c
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * jsimd_arm64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015-2016, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit ARM architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-#define JSIMD_FASTLD3 1
-#define JSIMD_FASTST3 2
-#define JSIMD_FASTTBL 4
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 |
-                                    JSIMD_FASTTBL;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_cpuinfo (char *buffer, const char *field, char *value)
-{
-  char *p;
-  if (*value == 0)
-    return 0;
-  if (strncmp(buffer, field, strlen(field)) != 0)
-    return 0;
-  buffer += strlen(field);
-  while (isspace(*buffer))
-    buffer++;
-
-  /* Check if 'value' is present in the buffer as a separate word */
-  while ((p = strstr(buffer, value))) {
-    if (p > buffer && !isspace(*(p - 1))) {
-      buffer++;
-      continue;
-    }
-    p += strlen(value);
-    if (*p != 0 && !isspace(*p)) {
-      buffer++;
-      continue;
-    }
-    return 1;
-  }
-  return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
-  char *buffer = (char *)malloc(bufsize);
-  FILE *fd;
-
-  if (!buffer)
-    return 0;
-
-  fd = fopen("/proc/cpuinfo", "r");
-  if (fd) {
-    while (fgets(buffer, bufsize, fd)) {
-      if (!strchr(buffer, '\n') && !feof(fd)) {
-        /* "impossible" happened - insufficient size of the buffer! */
-        fclose(fd);
-        free(buffer);
-        return 0;
-      }
-      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
-          check_cpuinfo(buffer, "CPU part", "0xd07"))
-        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
-           percent speedup by disabling the use of that instruction.  The
-           speedup on Cortex-A57 is more subtle but still measurable. */
-        simd_features &= ~JSIMD_FASTTBL;
-      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
-        /* The SIMD version of Huffman encoding is slower than the C version on
-           Cavium ThunderX.  Also, ld3 and st3 are abyssmally slow on that
-           CPU. */
-        simd_huffman = simd_features = 0;
-    }
-    fclose(fd);
-  }
-  free(buffer);
-  return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-
-/*
- * ARMv8 architectures support NEON extensions by default.
- * It is no longer optional as it was with ARMv7.
- */
-
-
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-  simd_support |= JSIMD_ARM_NEON;
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  while (!parse_proc_cpuinfo(bufsize)) {
-    bufsize *= 2;
-    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
-      break;
-  }
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENEON");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_ARM_NEON;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-  env = getenv("JSIMD_FASTLD3");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_features |= JSIMD_FASTLD3;
-  if ((env != NULL) && (strcmp(env, "0") == 0))
-    simd_features &= ~JSIMD_FASTLD3;
-  env = getenv("JSIMD_FASTST3");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_features |= JSIMD_FASTST3;
-  if ((env != NULL) && (strcmp(env, "0") == 0))
-    simd_features &= ~JSIMD_FASTST3;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      if (simd_features & JSIMD_FASTLD3)
-        neonfct=jsimd_extrgb_ycc_convert_neon;
-      else
-        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_extrgbx_ycc_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      if (simd_features & JSIMD_FASTLD3)
-        neonfct=jsimd_extbgr_ycc_convert_neon;
-      else
-        neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_extbgrx_ycc_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_extxbgr_ycc_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_extxrgb_ycc_convert_neon;
-      break;
-    default:
-      if (simd_features & JSIMD_FASTLD3)
-        neonfct=jsimd_extrgb_ycc_convert_neon;
-      else
-        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
-      break;
-  }
-
-  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      if (simd_features & JSIMD_FASTST3)
-        neonfct=jsimd_ycc_extrgb_convert_neon;
-      else
-        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      neonfct=jsimd_ycc_extrgbx_convert_neon;
-      break;
-    case JCS_EXT_BGR:
-      if (simd_features & JSIMD_FASTST3)
-        neonfct=jsimd_ycc_extbgr_convert_neon;
-      else
-        neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      neonfct=jsimd_ycc_extbgrx_convert_neon;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      neonfct=jsimd_ycc_extxbgr_convert_neon;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      neonfct=jsimd_ycc_extxrgb_convert_neon;
-      break;
-    default:
-      if (simd_features & JSIMD_FASTST3)
-        neonfct=jsimd_ycc_extrgb_convert_neon;
-      else
-        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
-      break;
-  }
-
-  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
-                                output_buf, num_rows);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_neon(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  jsimd_fdct_islow_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_neon(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_neon(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
-                      output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ARM_NEON && simd_huffman)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  if (simd_features & JSIMD_FASTTBL)
-    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
-                                            dctbl, actbl);
-  else
-    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
-                                                    last_dc_val, dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_arm_neon.S b/media/libjpeg/simd/jsimd_arm_neon.S
deleted file mode 100644
index cd2612724a..0000000000
--- a/media/libjpeg/simd/jsimd_arm_neon.S
+++ /dev/null
@@ -1,2878 +0,0 @@
-/*
- * ARMv7 NEON optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- * All Rights Reserved.
- * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
- * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
-#endif
-
-.text
-.fpu neon
-.arch armv7a
-.object_arch armv4
-.arm
-.syntax unified
-
-
-#define RESPECT_STRICT_ALIGNMENT 1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
-    .globl _\fname
-_\fname:
-#else
-    .global \fname
-#ifdef __ELF__
-    .hidden \fname
-    .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16         \x0, \x1
-    vtrn.16         \x2, \x3
-    vtrn.32         \x0, \x2
-    vtrn.32         \x1, \x3
-.endm
-
-
-#define CENTERJSAMPLE 128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
- *                        JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define FIX_0_298631336 (2446)
-#define FIX_0_390180644 (3196)
-#define FIX_0_541196100 (4433)
-#define FIX_0_765366865 (6270)
-#define FIX_0_899976223 (7373)
-#define FIX_1_175875602 (9633)
-#define FIX_1_501321110 (12299)
-#define FIX_1_847759065 (15137)
-#define FIX_1_961570560 (16069)
-#define FIX_2_053119869 (16819)
-#define FIX_2_562915447 (20995)
-#define FIX_3_072711026 (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
-{                                                                             \
-    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
-    JLONG   q1, q2, q3, q4, q5, q6, q7;                                       \
-    JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
-                                                                              \
-    /* 1-D iDCT input data */                                                 \
-    row0 = xrow0;                                                             \
-    row1 = xrow1;                                                             \
-    row2 = xrow2;                                                             \
-    row3 = xrow3;                                                             \
-    row4 = xrow4;                                                             \
-    row5 = xrow5;                                                             \
-    row6 = xrow6;                                                             \
-    row7 = xrow7;                                                             \
-                                                                              \
-    q5 = row7 + row3;                                                         \
-    q4 = row5 + row1;                                                         \
-    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
-         MULTIPLY(q4, FIX_1_175875602);                                       \
-    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
-         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
-    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
-         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
-    q4 = q6;                                                                  \
-    q3 = ((JLONG) row0 - (JLONG) row4) << 13;                                 \
-    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
-          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
-    /* now we can use q1 (reloadable constants have been used up) */          \
-    q1 = q3 + q2;                                                             \
-    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
-          MULTIPLY(row1, -FIX_0_899976223);                                   \
-    q5 = q7;                                                                  \
-    q1 = q1 + q6;                                                             \
-    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
-          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
-                                                                              \
-    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
-    tmp11_plus_tmp2 = q1;                                                     \
-    row1 = 0;                                                                 \
-                                                                              \
-    q1 = q1 - q6;                                                             \
-    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
-          MULTIPLY(row3, -FIX_2_562915447);                                   \
-    q1 = q1 - q6;                                                             \
-    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
-         MULTIPLY(row6, FIX_0_541196100);                                     \
-    q3 = q3 - q2;                                                             \
-                                                                              \
-    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
-    tmp11_minus_tmp2 = q1;                                                    \
-                                                                              \
-    q1 = ((JLONG) row0 + (JLONG) row4) << 13;                                 \
-    q2 = q1 + q6;                                                             \
-    q1 = q1 - q6;                                                             \
-                                                                              \
-    /* pick up the results */                                                 \
-    tmp0  = q4;                                                               \
-    tmp1  = q5;                                                               \
-    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
-    tmp3  = q7;                                                               \
-    tmp10 = q2;                                                               \
-    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
-    tmp12 = q3;                                                               \
-    tmp13 = q1;                                                               \
-}
-
-#define XFIX_0_899976223                   d0[0]
-#define XFIX_0_541196100                   d0[1]
-#define XFIX_2_562915447                   d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223 d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223 d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447 d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865  d1[2]
-#define XFIX_1_175875602                   d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644 d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065 d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447 d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560 d2[3]
-
-.balign 16
-jsimd_idct_islow_neon_consts:
-  .short FIX_0_899976223                    /* d0[0] */
-  .short FIX_0_541196100                    /* d0[1] */
-  .short FIX_2_562915447                    /* d0[2] */
-  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-  .short FIX_1_175875602                    /* d1[3] */
-  /* reloadable constants */
-  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
-
-asm_function jsimd_idct_islow_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    ROW0L           .req d16
-    ROW0R           .req d17
-    ROW1L           .req d18
-    ROW1R           .req d19
-    ROW2L           .req d20
-    ROW2R           .req d21
-    ROW3L           .req d22
-    ROW3R           .req d23
-    ROW4L           .req d24
-    ROW4R           .req d25
-    ROW5L           .req d26
-    ROW5R           .req d27
-    ROW6L           .req d28
-    ROW6R           .req d29
-    ROW7L           .req d30
-    ROW7R           .req d31
-
-    /* Load and dequantize coefficients into NEON registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( q8  )
-     *   1 | d18     | d19     ( q9  )
-     *   2 | d20     | d21     ( q10 )
-     *   3 | d22     | d23     ( q11 )
-     *   4 | d24     | d25     ( q12 )
-     *   5 | d26     | d27     ( q13 )
-     *   6 | d28     | d29     ( q14 )
-     *   7 | d30     | d31     ( q15 )
-     */
-    adr             ip, jsimd_idct_islow_neon_consts
-    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8, q8, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9, q9, q1
-    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
-    vmul.s16        q10, q10, q2
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vmul.s16        q11, q11, q3
-    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
-    vmul.s16        q12, q12, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q14, q14, q2
-    vmul.s16        q13, q13, q1
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
-    add             ip, ip, #16
-    vmul.s16        q15, q15, q3
-    vpush           {d8-d15}                      /* save NEON registers */
-    /* 1-D IDCT, pass 1, left 4x8 half */
-    vadd.s16        d4, ROW7L, ROW3L
-    vadd.s16        d5, ROW5L, ROW1L
-    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, d5, XFIX_1_175875602
-    vmull.s16       q7, d4, XFIX_1_175875602
-      /* Check for the zero coefficients in the right 4x8 half */
-      push            {r4, r5}
-    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3, ROW0L, ROW4L
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
-      orr             r0, r4, r5
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3, q3, #13
-      orr             r0, r0, r4
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-      orr             r0, r0, r5
-    vadd.s32        q1, q3, q2
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-      orr             r0, r0, r4
-    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
-      orr             r0, r0, r5
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1L, q1, #11
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
-      orr             r0, r0, r4
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-      orr             r0, r0, r5
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    vmlal.s16       q6, ROW6L, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-      orr             r0, r0, r4
-    vrshrn.s32      ROW6L, q1, #11
-      orr             r0, r0, r5
-    vadd.s32        q1, q3, q5
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0L, ROW4L
-      orr             r0, r0, r4
-    vrshrn.s32      ROW2L, q1, #11
-      orr             r0, r0, r5
-    vrshrn.s32      ROW5L, q3, #11
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
-      orr             r0, r0, r4
-    vadd.s32        q2, q5, q6
-      orrs            r0, r0, r5
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-      orr             r0, r4, r5
-    vsub.s32        q3, q1, q4
-      pop             {r4, r5}
-    vrshrn.s32      ROW7L, q2, #11
-    vrshrn.s32      ROW3L, q5, #11
-    vrshrn.s32      ROW0L, q6, #11
-    vrshrn.s32      ROW4L, q3, #11
-
-      beq             3f  /* Go to do some special handling for the sparse
-                             right 4x8 half */
-
-    /* 1-D IDCT, pass 1, right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]  /* reload constants */
-    vadd.s16        d10, ROW7R, ROW3R
-    vadd.s16        d8, ROW5R, ROW1R
-      /* Transpose left 4x8 half */
-      vtrn.16         ROW6L, ROW7L
-    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, d8, XFIX_1_175875602
-      vtrn.16         ROW2L, ROW3L
-    vmull.s16       q7, d10, XFIX_1_175875602
-    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
-      vtrn.16         ROW0L, ROW1L
-    vsubl.s16       q3, ROW0R, ROW4R
-    vmull.s16       q2, ROW2R, XFIX_0_541196100
-    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
-      vtrn.16         ROW4L, ROW5L
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
-    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
-      vtrn.32         ROW1L, ROW3L
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
-      vtrn.32         ROW4L, ROW6L
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-      vtrn.32         ROW0L, ROW2L
-    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
-    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1R, q1, #11
-      vtrn.32         ROW5L, ROW7L
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6, ROW6R, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-    vrshrn.s32      ROW6R, q1, #11
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0R, ROW4R
-    vrshrn.s32      ROW2R, q1, #11
-    vrshrn.s32      ROW5R, q3, #11
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vrshrn.s32      ROW7R, q2, #11
-    vrshrn.s32      ROW3R, q5, #11
-    vrshrn.s32      ROW0R, q6, #11
-    vrshrn.s32      ROW4R, q3, #11
-    /* Transpose right 4x8 half */
-    vtrn.16         ROW6R, ROW7R
-    vtrn.16         ROW2R, ROW3R
-    vtrn.16         ROW0R, ROW1R
-    vtrn.16         ROW4R, ROW5R
-    vtrn.32         ROW1R, ROW3R
-    vtrn.32         ROW4R, ROW6R
-    vtrn.32         ROW0R, ROW2R
-    vtrn.32         ROW5R, ROW7R
-
-1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW1L, XFIX_1_175875602
-    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
-    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW3L, XFIX_1_175875602
-    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
-    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32       ROW1L, q1, #16
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW2L, q1, #16
-    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5, #16
-    vshrn.s32       ROW0L, q6, #16
-    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
-    /* 1-D IDCT, pass 2, right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW5R, XFIX_1_175875602
-    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
-    vmull.s16       q7, ROW7R, XFIX_1_175875602
-    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
-    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
-    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
-    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
-    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
-    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
-    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
-    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
-    vmlal.s16       q6, ROW6R, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW6R, q1, #16
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3, #16
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW7R, q2, #16
-    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3, #16
-
-2:  /* Descale to 8-bit and range limit */
-    vqrshrn.s16     d16, q8, #2
-    vqrshrn.s16     d17, q9, #2
-    vqrshrn.s16     d18, q10, #2
-    vqrshrn.s16     d19, q11, #2
-    vpop            {d8-d15}                      /* restore NEON registers */
-    vqrshrn.s16     d20, q12, #2
-      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-      vtrn.16         q8, q9
-    vqrshrn.s16     d21, q13, #2
-    vqrshrn.s16     d22, q14, #2
-      vmov.u8         q0, #(CENTERJSAMPLE)
-    vqrshrn.s16     d23, q15, #2
-      vtrn.8          d16, d17
-      vtrn.8          d18, d19
-      vadd.u8         q8, q8, q0
-      vadd.u8         q9, q9, q0
-      vtrn.16         q10, q11
-        /* Store results to the output buffer */
-        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        vst1.8          {d16}, [TMP1]
-      vtrn.8          d20, d21
-        vst1.8          {d17}, [TMP2]
-        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        vst1.8          {d18}, [TMP1]
-      vadd.u8         q10, q10, q0
-        vst1.8          {d19}, [TMP2]
-        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        add             TMP3, TMP3, OUTPUT_COL
-        add             TMP4, TMP4, OUTPUT_COL
-      vtrn.8          d22, d23
-        vst1.8          {d20}, [TMP1]
-      vadd.u8         q11, q11, q0
-        vst1.8          {d21}, [TMP2]
-        vst1.8          {d22}, [TMP3]
-        vst1.8          {d23}, [TMP4]
-    bx              lr
-
-3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
-
-    /* Transpose left 4x8 half */
-    vtrn.16         ROW6L, ROW7L
-    vtrn.16         ROW2L, ROW3L
-    vtrn.16         ROW0L, ROW1L
-    vtrn.16         ROW4L, ROW5L
-    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
-    vtrn.32         ROW1L, ROW3L
-    vtrn.32         ROW4L, ROW6L
-    vtrn.32         ROW0L, ROW2L
-    vtrn.32         ROW5L, ROW7L
-
-    cmp             r0, #0
-    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
-                           pass */
-
-    /* Only row 0 is non-zero for the right 4x8 half  */
-    vdup.s16        ROW1R, ROW0R[1]
-    vdup.s16        ROW2R, ROW0R[2]
-    vdup.s16        ROW3R, ROW0R[3]
-    vdup.s16        ROW4R, ROW0R[0]
-    vdup.s16        ROW5R, ROW0R[1]
-    vdup.s16        ROW6R, ROW0R[2]
-    vdup.s16        ROW7R, ROW0R[3]
-    vdup.s16        ROW0R, ROW0R[0]
-    b               1b  /* Go to 'normal' second pass */
-
-4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW1L, XFIX_1_175875602
-    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW3L, XFIX_1_175875602
-    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vshll.s16       q3, ROW0L, #13
-    vmov            q4, q6
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1, q1, q6
-    vadd.s32        q6, q6, q6
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-    vshrn.s32       ROW1L, q1, #16
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vshll.s16       q5, ROW0L, #13
-    vshrn.s32       ROW2L, q1, #16
-    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5, #16
-    vshrn.s32       ROW0L, q6, #16
-    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
-    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW5L, XFIX_1_175875602
-    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW7L, XFIX_1_175875602
-    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2, ROW6L, XFIX_0_541196100
-    vshll.s16       q3, ROW4L, #13
-    vmov            q4, q6
-    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1, q1, q6
-    vadd.s32        q6, q6, q6
-    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
-    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW6R, q1, #16
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vshll.s16       q5, ROW4L, #13
-    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3, #16
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW7R, q2, #16
-    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3, #16
-    b               2b                            /* Go to epilogue */
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-    .unreq          ROW0L
-    .unreq          ROW0R
-    .unreq          ROW1L
-    .unreq          ROW1R
-    .unreq          ROW2L
-    .unreq          ROW2R
-    .unreq          ROW3L
-    .unreq          ROW3R
-    .unreq          ROW4L
-    .unreq          ROW4R
-    .unreq          ROW5L
-    .unreq          ROW5R
-    .unreq          ROW6L
-    .unreq          ROW6R
-    .unreq          ROW7L
-    .unreq          ROW7R
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
- * function from jidctfst.c
- *
- * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in ARM NEON case some extra additions are required because VQDMULH
- * instruction can't handle the constants larger than 1. So the expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
- */
-
-#define XFIX_1_082392200 d0[0]
-#define XFIX_1_414213562 d0[1]
-#define XFIX_1_847759065 d0[2]
-#define XFIX_2_613125930 d0[3]
-
-.balign 16
-jsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
-
-asm_function jsimd_idct_ifast_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    /* Load and dequantize coefficients into NEON registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( q8  )
-     *   1 | d18     | d19     ( q9  )
-     *   2 | d20     | d21     ( q10 )
-     *   3 | d22     | d23     ( q11 )
-     *   4 | d24     | d25     ( q12 )
-     *   5 | d26     | d27     ( q13 )
-     *   6 | d28     | d29     ( q14 )
-     *   7 | d30     | d31     ( q15 )
-     */
-    adr             ip, jsimd_idct_ifast_neon_consts
-    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8, q8, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9, q9, q1
-    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
-    vmul.s16        q10, q10, q2
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vmul.s16        q11, q11, q3
-    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
-    vmul.s16        q12, q12, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q14, q14, q2
-    vmul.s16        q13, q13, q1
-    vld1.16         {d0}, [ip, :64]  /* load constants */
-    vmul.s16        q15, q15, q3
-    vpush           {d8-d13}         /* save NEON registers */
-    /* 1-D IDCT, pass 1 */
-    vsub.s16        q2, q10, q14
-    vadd.s16        q14, q10, q14
-    vsub.s16        q1, q11, q13
-    vadd.s16        q13, q11, q13
-    vsub.s16        q5, q9, q15
-    vadd.s16        q15, q9, q15
-    vqdmulh.s16     q4, q2, XFIX_1_414213562
-    vqdmulh.s16     q6, q1, XFIX_2_613125930
-    vadd.s16        q3, q1, q1
-    vsub.s16        q1, q5, q1
-    vadd.s16        q10, q2, q4
-    vqdmulh.s16     q4, q1, XFIX_1_847759065
-    vsub.s16        q2, q15, q13
-    vadd.s16        q3, q3, q6
-    vqdmulh.s16     q6, q2, XFIX_1_414213562
-    vadd.s16        q1, q1, q4
-    vqdmulh.s16     q4, q5, XFIX_1_082392200
-    vsub.s16        q10, q10, q14
-    vadd.s16        q2, q2, q6
-    vsub.s16        q6, q8, q12
-    vadd.s16        q12, q8, q12
-    vadd.s16        q9, q5, q4
-    vadd.s16        q5, q6, q10
-    vsub.s16        q10, q6, q10
-    vadd.s16        q6, q15, q13
-    vadd.s16        q8, q12, q14
-    vsub.s16        q3, q6, q3
-    vsub.s16        q12, q12, q14
-    vsub.s16        q3, q3, q1
-    vsub.s16        q1, q9, q1
-    vadd.s16        q2, q3, q2
-    vsub.s16        q15, q8, q6
-    vadd.s16        q1, q1, q2
-    vadd.s16        q8, q8, q6
-    vadd.s16        q14, q5, q3
-    vsub.s16        q9, q5, q3
-    vsub.s16        q13, q10, q2
-    vadd.s16        q10, q10, q2
-      /* Transpose */
-      vtrn.16         q8, q9
-    vsub.s16        q11, q12, q1
-      vtrn.16         q14, q15
-    vadd.s16        q12, q12, q1
-      vtrn.16         q10, q11
-      vtrn.16         q12, q13
-      vtrn.32         q9, q11
-      vtrn.32         q12, q14
-      vtrn.32         q8, q10
-      vtrn.32         q13, q15
-      vswp            d28, d21
-      vswp            d26, d19
-    /* 1-D IDCT, pass 2 */
-    vsub.s16        q2, q10, q14
-      vswp            d30, d23
-    vadd.s16        q14, q10, q14
-      vswp            d24, d17
-    vsub.s16        q1, q11, q13
-    vadd.s16        q13, q11, q13
-    vsub.s16        q5, q9, q15
-    vadd.s16        q15, q9, q15
-    vqdmulh.s16     q4, q2, XFIX_1_414213562
-    vqdmulh.s16     q6, q1, XFIX_2_613125930
-    vadd.s16        q3, q1, q1
-    vsub.s16        q1, q5, q1
-    vadd.s16        q10, q2, q4
-    vqdmulh.s16     q4, q1, XFIX_1_847759065
-    vsub.s16        q2, q15, q13
-    vadd.s16        q3, q3, q6
-    vqdmulh.s16     q6, q2, XFIX_1_414213562
-    vadd.s16        q1, q1, q4
-    vqdmulh.s16     q4, q5, XFIX_1_082392200
-    vsub.s16        q10, q10, q14
-    vadd.s16        q2, q2, q6
-    vsub.s16        q6, q8, q12
-    vadd.s16        q12, q8, q12
-    vadd.s16        q9, q5, q4
-    vadd.s16        q5, q6, q10
-    vsub.s16        q10, q6, q10
-    vadd.s16        q6, q15, q13
-    vadd.s16        q8, q12, q14
-    vsub.s16        q3, q6, q3
-    vsub.s16        q12, q12, q14
-    vsub.s16        q3, q3, q1
-    vsub.s16        q1, q9, q1
-    vadd.s16        q2, q3, q2
-    vsub.s16        q15, q8, q6
-    vadd.s16        q1, q1, q2
-    vadd.s16        q8, q8, q6
-    vadd.s16        q14, q5, q3
-    vsub.s16        q9, q5, q3
-    vsub.s16        q13, q10, q2
-    vpop            {d8-d13}      /* restore NEON registers */
-    vadd.s16        q10, q10, q2
-    vsub.s16        q11, q12, q1
-    vadd.s16        q12, q12, q1
-    /* Descale to 8-bit and range limit */
-    vmov.u8         q0, #0x80
-    vqshrn.s16      d16, q8, #5
-    vqshrn.s16      d17, q9, #5
-    vqshrn.s16      d18, q10, #5
-    vqshrn.s16      d19, q11, #5
-    vqshrn.s16      d20, q12, #5
-    vqshrn.s16      d21, q13, #5
-    vqshrn.s16      d22, q14, #5
-    vqshrn.s16      d23, q15, #5
-    vadd.u8         q8, q8, q0
-    vadd.u8         q9, q9, q0
-    vadd.u8         q10, q10, q0
-    vadd.u8         q11, q11, q0
-    /* Transpose the final 8-bit samples */
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.8          d16, d17
-    vtrn.8          d18, d19
-      /* Store results to the output buffer */
-      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      vst1.8          {d16}, [TMP1]
-      vst1.8          {d17}, [TMP2]
-      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      vst1.8          {d18}, [TMP1]
-    vtrn.8          d20, d21
-      vst1.8          {d19}, [TMP2]
-      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      add             TMP3, TMP3, OUTPUT_COL
-      add             TMP4, TMP4, OUTPUT_COL
-      vst1.8          {d20}, [TMP1]
-    vtrn.8          d22, d23
-      vst1.8          {d21}, [TMP2]
-      vst1.8          {d22}, [TMP3]
-      vst1.8          {d23}, [TMP4]
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular NEON optimized function is
- *       bit exact compatibility with jpeg-6b.
- *
- * TODO: a bit better instructions scheduling can be achieved by expanding
- *       idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
-
-#define CONST_BITS  13
-
-#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
-
-.balign 16
-jsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065      /* d0[0] */
-  .short -FIX_0_765366865     /* d0[1] */
-  .short -FIX_0_211164243     /* d0[2] */
-  .short FIX_1_451774981      /* d0[3] */
-  .short -FIX_2_172734803     /* d1[0] */
-  .short FIX_1_061594337      /* d1[1] */
-  .short -FIX_0_509795579     /* d1[2] */
-  .short -FIX_0_601344887     /* d1[3] */
-  .short FIX_0_899976223      /* d2[0] */
-  .short FIX_2_562915447      /* d2[1] */
-  .short 1 << (CONST_BITS+1)  /* d2[2] */
-  .short 0                    /* d2[3] */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    vmull.s16       q14, \x4, d2[2]
-    vmlal.s16       q14, \x8, d0[0]
-    vmlal.s16       q14, \x14, d0[1]
-
-    vmull.s16       q13, \x16, d1[2]
-    vmlal.s16       q13, \x12, d1[3]
-    vmlal.s16       q13, \x10, d2[0]
-    vmlal.s16       q13, \x6, d2[1]
-
-    vmull.s16       q15, \x4, d2[2]
-    vmlsl.s16       q15, \x8, d0[0]
-    vmlsl.s16       q15, \x14, d0[1]
-
-    vmull.s16       q12, \x16, d0[2]
-    vmlal.s16       q12, \x12, d0[3]
-    vmlal.s16       q12, \x10, d1[0]
-    vmlal.s16       q12, \x6, d1[1]
-
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y29, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift
-    vrshrn.s32      \y29, q14, #\shift
-  .endif
-
-    vadd.s32        q10, q15, q12
-    vsub.s32        q15, q15, q12
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q15, q15, #\shift
-    vmovn.s32       \y27, q10
-    vmovn.s32       \y28, q15
-  .else
-    vrshrn.s32      \y27, q10, #\shift
-    vrshrn.s32      \y28, q15, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    vpush           {d8-d15}
-
-    /* Load constants (d3 is just used for padding) */
-    adr             TMP4, jsimd_idct_4x4_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
-
-    /* Load all COEF_BLOCK into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | d8      | d9
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | d14     | d15
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
-    add COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* dequantize */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q3, q3, q10
-    vmul.s16        q4, q4, q11
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    vmul.s16        q6, q6, q13
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q7, q7, q14
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 */
-    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
-    transpose_4x4   d5, d7, d9, d11
-
-    /* Pass 2 */
-    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
-    transpose_4x4   d26, d27, d28, d29
-
-    /* Range limit */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vadd.s16        q14, q14, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q14
-
-    /* Store results to the output buffer */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use much less instructions on little endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses
-     */
-    vst1.32         {d26[0]}, [TMP1]!
-    vst1.32         {d27[0]}, [TMP3]!
-    vst1.32         {d26[1]}, [TMP2]!
-    vst1.32         {d27[1]}, [TMP4]!
-#else
-    vst1.8          {d26[0]}, [TMP1]!
-    vst1.8          {d27[0]}, [TMP3]!
-    vst1.8          {d26[1]}, [TMP1]!
-    vst1.8          {d27[1]}, [TMP3]!
-    vst1.8          {d26[2]}, [TMP1]!
-    vst1.8          {d27[2]}, [TMP3]!
-    vst1.8          {d26[3]}, [TMP1]!
-    vst1.8          {d27[3]}, [TMP3]!
-
-    vst1.8          {d26[4]}, [TMP2]!
-    vst1.8          {d27[4]}, [TMP4]!
-    vst1.8          {d26[5]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP4]!
-    vst1.8          {d26[6]}, [TMP2]!
-    vst1.8          {d27[6]}, [TMP4]!
-    vst1.8          {d26[7]}, [TMP2]!
-    vst1.8          {d27[7]}, [TMP4]!
-#endif
-
-    vpop            {d8-d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular NEON optimized function is
- *       bit exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* d0[0] */
-  .short FIX_0_850430095   /* d0[1] */
-  .short -FIX_1_272758580  /* d0[2] */
-  .short FIX_3_624509785   /* d0[3] */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    vshll.s16       q14, \x4, #15
-    vmull.s16       q13, \x6, d0[3]
-    vmlal.s16       q13, \x10, d0[2]
-    vmlal.s16       q13, \x12, d0[1]
-    vmlal.s16       q13, \x16, d0[0]
-
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y27, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift
-    vrshrn.s32      \y27, q14, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req ip
-
-    vpush           {d8-d15}
-
-    /* Load constants */
-    adr             TMP2, jsimd_idct_2x2_neon_consts
-    vld1.16         {d0}, [TMP2, :64]
-
-    /* Load all COEF_BLOCK into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | -       | -
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | -       | -
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* Dequantize */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vmul.s16        q3, q3, q10
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
-    vmul.s16        q6, q6, q13
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 */
-#if 0
-    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
-    transpose_4x4   d5, d7, d9, d11
-#else
-    vmull.s16       q13, d6, d0[3]
-    vmlal.s16       q13, d10, d0[2]
-    vmlal.s16       q13, d12, d0[1]
-    vmlal.s16       q13, d16, d0[0]
-    vmull.s16       q12, d7, d0[3]
-    vmlal.s16       q12, d11, d0[2]
-    vmlal.s16       q12, d13, d0[1]
-    vmlal.s16       q12, d17, d0[0]
-    vshll.s16       q14, d4, #15
-    vshll.s16       q15, d5, #15
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-    vrshrn.s32      d4, q10, #13
-    vrshrn.s32      d6, q14, #13
-    vadd.s32        q10, q15, q12
-    vsub.s32        q14, q15, q12
-    vrshrn.s32      d5, q10, #13
-    vrshrn.s32      d7, q14, #13
-    vtrn.16         q2, q3
-    vtrn.32         q3, q5
-#endif
-
-    /* Pass 2 */
-    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
-
-    /* Range limit */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q13
-
-    /* Store results to the output buffer */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    vst1.8          {d26[0]}, [TMP1]!
-    vst1.8          {d27[4]}, [TMP1]!
-    vst1.8          {d26[1]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP2]!
-
-    vpop            {d8-d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_extrgb_convert_neon
- * jsimd_ycc_extbgr_convert_neon
- * jsimd_ycc_extrgbx_convert_neon
- * jsimd_ycc_extbgrx_convert_neon
- * jsimd_ycc_extxbgr_convert_neon
- * jsimd_ycc_extxrgb_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB
- */
-
-
-.macro do_load size
-  .if \size == 8
-    vld1.8          {d4}, [U, :64]!
-    vld1.8          {d5}, [V, :64]!
-    vld1.8          {d0}, [Y, :64]!
-    pld             [U, #64]
-    pld             [V, #64]
-    pld             [Y, #64]
-  .elseif \size == 4
-    vld1.8          {d4[0]}, [U]!
-    vld1.8          {d4[1]}, [U]!
-    vld1.8          {d4[2]}, [U]!
-    vld1.8          {d4[3]}, [U]!
-    vld1.8          {d5[0]}, [V]!
-    vld1.8          {d5[1]}, [V]!
-    vld1.8          {d5[2]}, [V]!
-    vld1.8          {d5[3]}, [V]!
-    vld1.8          {d0[0]}, [Y]!
-    vld1.8          {d0[1]}, [Y]!
-    vld1.8          {d0[2]}, [Y]!
-    vld1.8          {d0[3]}, [Y]!
-  .elseif \size == 2
-    vld1.8          {d4[4]}, [U]!
-    vld1.8          {d4[5]}, [U]!
-    vld1.8          {d5[4]}, [V]!
-    vld1.8          {d5[5]}, [V]!
-    vld1.8          {d0[4]}, [Y]!
-    vld1.8          {d0[5]}, [Y]!
-  .elseif \size == 1
-    vld1.8          {d4[6]}, [U]!
-    vld1.8          {d5[6]}, [V]!
-    vld1.8          {d0[6]}, [Y]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_store bpp, size
-  .if \bpp == 24
-    .if \size == 8
-      vst3.8        {d10, d11, d12}, [RGB]!
-    .elseif \size == 4
-      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
-      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
-      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
-      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
-    .elseif \size == 2
-      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
-      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
-    .elseif \size == 1
-      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 32
-    .if \size == 8
-      vst4.8        {d10, d11, d12, d13}, [RGB]!
-    .elseif \size == 4
-      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-    .elseif \size == 2
-      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-    .elseif \size == 1
-      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 16
-    .if \size == 8
-      vst1.16       {q15}, [RGB]!
-    .elseif \size == 4
-      vst1.16       {d30}, [RGB]!
-    .elseif \size == 2
-      vst1.16       {d31[0]}, [RGB]!
-      vst1.16       {d31[1]}, [RGB]!
-    .elseif \size == 1
-      vst1.16       {d31[2]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
-    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
-    vrshrn.s32      d20, q10, #15
-    vrshrn.s32      d21, q11, #15
-    vrshrn.s32      d24, q12, #14
-    vrshrn.s32      d25, q13, #14
-    vrshrn.s32      d28, q14, #14
-    vrshrn.s32      d29, q15, #14
-    vaddw.u8        q11, q10, d0
-    vaddw.u8        q12, q12, d0
-    vaddw.u8        q14, q14, d0
-  .if \bpp != 16
-    vqmovun.s16     d1\g_offs, q11
-    vqmovun.s16     d1\r_offs, q12
-    vqmovun.s16     d1\b_offs, q14
-  .else  /* rgb565 */
-    vqshlu.s16      q13, q11, #8
-    vqshlu.s16      q15, q12, #8
-    vqshlu.s16      q14, q14, #8
-    vsri.u16        q15, q13, #5
-    vsri.u16        q15, q14, #11
-  .endif
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
-                                       /* "do_yuv_to_rgb_stage2" and "store" */
-                                       vrshrn.s32      d20, q10, #15
-    /* "load" and "do_yuv_to_rgb_stage1" */
-    pld             [U, #64]
-                                       vrshrn.s32      d21, q11, #15
-    pld             [V, #64]
-                                       vrshrn.s32      d24, q12, #14
-                                       vrshrn.s32      d25, q13, #14
-    vld1.8          {d4}, [U, :64]!
-                                       vrshrn.s32      d28, q14, #14
-    vld1.8          {d5}, [V, :64]!
-                                       vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q2 = v - 128 */
-                                       vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-                                       vaddw.u8        q12, q12, d0
-                                       vaddw.u8        q14, q14, d0
-  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
-                                       vqmovun.s16     d1\g_offs, q11
-    pld             [Y, #64]
-                                       vqmovun.s16     d1\r_offs, q12
-    vld1.8          {d0}, [Y, :64]!
-                                       vqmovun.s16     d1\b_offs, q14
-    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
-                                       do_store        \bpp, 8
-    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
-  .else  /**************************** rgb565 ********************************/
-                                       vqshlu.s16      q13, q11, #8
-    pld             [Y, #64]
-                                       vqshlu.s16      q15, q12, #8
-                                       vqshlu.s16      q14, q14, #8
-    vld1.8          {d0}, [Y, :64]!
-    vmull.s16       q11, d7, d1[1]
-    vmlal.s16       q11, d9, d1[2]
-                                       vsri.u16        q15, q13, #5
-    vmull.s16       q12, d8, d1[0]
-                                       vsri.u16        q15, q14, #11
-    vmull.s16       q13, d9, d1[0]
-    vmull.s16       q14, d6, d1[3]
-                                       do_store        \bpp, 8
-    vmull.s16       q15, d7, d1[3]
-  .endif
-.endm
-
-.macro do_yuv_to_rgb
-    do_yuv_to_rgb_stage1
-    do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
-asm_function jsimd_ycc_\colorid\()_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    INPUT_ROW       .req r2
-    OUTPUT_BUF      .req r3
-    NUM_ROWS        .req r4
-
-    INPUT_BUF0      .req r5
-    INPUT_BUF1      .req r6
-    INPUT_BUF2      .req INPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
-    adr             ip, jsimd_ycc_\colorid\()_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save ARM registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, #4]
-    ldr             INPUT_BUF2, [INPUT_BUF, #8]
-    .unreq          INPUT_BUF
-
-    /* Save NEON registers */
-    vpush           {d8-d15}
-
-    /* Initially set d10, d11, d12, d13 to 0xFF */
-    vmov.u8         q5, #255
-    vmov.u8         q6, #255
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
-    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
-    add             INPUT_ROW, INPUT_ROW, #1
-    ldr             RGB, [OUTPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f
-    do_load         8
-    do_yuv_to_rgb_stage1
-    subs            N, N, #8
-    blt             2f
-1:
-    do_yuv_to_rgb_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
-    tst             N, #7
-    beq             8f
-3:
-    tst             N, #4
-    beq             3f
-    do_load         4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         1
-5:
-    do_yuv_to_rgb
-    tst             N, #4
-    beq             6f
-    do_store        \bpp, 4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        \bpp, 2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        \bpp, 1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8-d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          INPUT_ROW
-    .unreq          OUTPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          INPUT_BUF0
-    .unreq          INPUT_BUF1
-    .unreq          INPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro do_store size
-  .if \size == 8
-    vst1.8          {d20}, [Y]!
-    vst1.8          {d21}, [U]!
-    vst1.8          {d22}, [V]!
-  .elseif \size == 4
-    vst1.8          {d20[0]}, [Y]!
-    vst1.8          {d20[1]}, [Y]!
-    vst1.8          {d20[2]}, [Y]!
-    vst1.8          {d20[3]}, [Y]!
-    vst1.8          {d21[0]}, [U]!
-    vst1.8          {d21[1]}, [U]!
-    vst1.8          {d21[2]}, [U]!
-    vst1.8          {d21[3]}, [U]!
-    vst1.8          {d22[0]}, [V]!
-    vst1.8          {d22[1]}, [V]!
-    vst1.8          {d22[2]}, [V]!
-    vst1.8          {d22[3]}, [V]!
-  .elseif \size == 2
-    vst1.8          {d20[4]}, [Y]!
-    vst1.8          {d20[5]}, [Y]!
-    vst1.8          {d21[4]}, [U]!
-    vst1.8          {d21[5]}, [U]!
-    vst1.8          {d22[4]}, [V]!
-    vst1.8          {d22[5]}, [V]!
-  .elseif \size == 1
-    vst1.8          {d20[6]}, [Y]!
-    vst1.8          {d21[6]}, [U]!
-    vst1.8          {d22[6]}, [V]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_load bpp, size
-  .if \bpp == 24
-    .if \size == 8
-      vld3.8        {d10, d11, d12}, [RGB]!
-      pld           [RGB, #128]
-    .elseif \size == 4
-      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
-      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
-      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
-      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
-    .elseif \size == 2
-      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
-      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
-    .elseif \size == 1
-      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 32
-    .if \size == 8
-      vld4.8        {d10, d11, d12, d13}, [RGB]!
-      pld           [RGB, #128]
-    .elseif \size == 4
-      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-    .elseif \size == 2
-      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-    .elseif \size == 1
-      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-.macro do_rgb_to_yuv_stage1
-    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
-    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
-    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
-    vmull.u16       q7, d4, d0[0]
-    vmlal.u16       q7, d6, d0[1]
-    vmlal.u16       q7, d8, d0[2]
-    vmull.u16       q8, d5, d0[0]
-    vmlal.u16       q8, d7, d0[1]
-    vmlal.u16       q8, d9, d0[2]
-    vrev64.32       q9, q1
-    vrev64.32       q13, q1
-    vmlsl.u16       q9, d4, d0[3]
-    vmlsl.u16       q9, d6, d1[0]
-    vmlal.u16       q9, d8, d1[1]
-    vmlsl.u16       q13, d5, d0[3]
-    vmlsl.u16       q13, d7, d1[0]
-    vmlal.u16       q13, d9, d1[1]
-    vrev64.32       q14, q1
-    vrev64.32       q15, q1
-    vmlal.u16       q14, d4, d1[1]
-    vmlsl.u16       q14, d6, d1[2]
-    vmlsl.u16       q14, d8, d1[3]
-    vmlal.u16       q15, d5, d1[1]
-    vmlsl.u16       q15, d7, d1[2]
-    vmlsl.u16       q15, d9, d1[3]
-.endm
-
-.macro do_rgb_to_yuv_stage2
-    vrshrn.u32      d20, q7, #16
-    vrshrn.u32      d21, q8, #16
-    vshrn.u32       d22, q9, #16
-    vshrn.u32       d23, q13, #16
-    vshrn.u32       d24, q14, #16
-    vshrn.u32       d25, q15, #16
-    vmovn.u16       d20, q10       /* d20 = y */
-    vmovn.u16       d21, q11       /* d21 = u */
-    vmovn.u16       d22, q12       /* d22 = v */
-.endm
-
-.macro do_rgb_to_yuv
-    do_rgb_to_yuv_stage1
-    do_rgb_to_yuv_stage2
-.endm
-
-.macro do_rgb_to_yuv_stage2_store_load_stage1
-      vrshrn.u32      d20, q7, #16
-      vrshrn.u32      d21, q8, #16
-      vshrn.u32       d22, q9, #16
-    vrev64.32       q9, q1
-      vshrn.u32       d23, q13, #16
-    vrev64.32       q13, q1
-      vshrn.u32       d24, q14, #16
-      vshrn.u32       d25, q15, #16
-    do_load         \bpp, 8
-      vmovn.u16       d20, q10     /* d20 = y */
-    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
-      vmovn.u16       d21, q11     /* d21 = u */
-    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
-      vmovn.u16       d22, q12     /* d22 = v */
-    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
-    vmull.u16       q7, d4, d0[0]
-    vmlal.u16       q7, d6, d0[1]
-    vmlal.u16       q7, d8, d0[2]
-      vst1.8          {d20}, [Y]!
-    vmull.u16       q8, d5, d0[0]
-    vmlal.u16       q8, d7, d0[1]
-    vmlal.u16       q8, d9, d0[2]
-    vmlsl.u16       q9, d4, d0[3]
-    vmlsl.u16       q9, d6, d1[0]
-    vmlal.u16       q9, d8, d1[1]
-      vst1.8          {d21}, [U]!
-    vmlsl.u16       q13, d5, d0[3]
-    vmlsl.u16       q13, d7, d1[0]
-    vmlal.u16       q13, d9, d1[1]
-    vrev64.32       q14, q1
-    vrev64.32       q15, q1
-    vmlal.u16       q14, d4, d1[1]
-    vmlsl.u16       q14, d6, d1[2]
-    vmlsl.u16       q14, d8, d1[3]
-      vst1.8          {d22}, [V]!
-    vmlal.u16       q15, d5, d1[1]
-    vmlsl.u16       q15, d7, d1[2]
-    vmlsl.u16       q15, d9, d1[3]
-.endm
-
-.balign 16
-jsimd_\colorid\()_ycc_neon_consts:
-  .short 19595, 38470, 7471,  11059
-  .short 21709, 32768, 27439, 5329
-  .short 32767, 128,   32767, 128
-  .short 32767, 128,   32767, 128
-
-asm_function jsimd_\colorid\()_ycc_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_ROW      .req r3
-    NUM_ROWS        .req r4
-
-    OUTPUT_BUF0     .req r5
-    OUTPUT_BUF1     .req r6
-    OUTPUT_BUF2     .req OUTPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d0, d1, d2, d3 */
-    adr             ip, jsimd_\colorid\()_ycc_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save ARM registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
-    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
-    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
-    .unreq          OUTPUT_BUF
-
-    /* Save NEON registers */
-    vpush           {d8-d15}
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
-    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
-    add             OUTPUT_ROW, OUTPUT_ROW, #1
-    ldr             RGB, [INPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f
-    do_load         \bpp, 8
-    do_rgb_to_yuv_stage1
-    subs            N, N, #8
-    blt             2f
-1:
-    do_rgb_to_yuv_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    do_rgb_to_yuv_stage2
-    do_store        8
-    tst             N, #7
-    beq             8f
-3:
-    tst             N, #4
-    beq             3f
-    do_load         \bpp, 4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         \bpp, 2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         \bpp, 1
-5:
-    do_rgb_to_yuv
-    tst             N, #4
-    beq             6f
-    do_store        4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8-d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          OUTPUT_ROW
-    .unreq          INPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          OUTPUT_BUF0
-    .unreq          OUTPUT_BUF1
-    .unreq          OUTPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
-
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- *       rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req r0
-    START_COL       .req r1
-    WORKSPACE       .req r2
-    TMP1            .req r3
-    TMP2            .req r4
-    TMP3            .req r5
-    TMP4            .req ip
-
-    push            {r4, r5}
-    vmov.u8         d0, #128
-
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d16}, [TMP1]
-    vsubl.u8        q8, d16, d0
-    vld1.8          {d18}, [TMP2]
-    vsubl.u8        q9, d18, d0
-    vld1.8          {d20}, [TMP3]
-    vsubl.u8        q10, d20, d0
-    vld1.8          {d22}, [TMP4]
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    vsubl.u8        q11, d22, d0
-    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d24}, [TMP1]
-    vsubl.u8        q12, d24, d0
-    vld1.8          {d26}, [TMP2]
-    vsubl.u8        q13, d26, d0
-    vld1.8          {d28}, [TMP3]
-    vsubl.u8        q14, d28, d0
-    vld1.8          {d30}, [TMP4]
-    vsubl.u8        q15, d30, d0
-    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
-    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
-    pop             {r4, r5}
-    bx              lr
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- *       rid of a bunch of VLD1.16 instructions
- */
-
-#define XFIX_0_382683433 d0[0]
-#define XFIX_0_541196100 d0[1]
-#define XFIX_0_707106781 d0[2]
-#define XFIX_1_306562965 d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req r0
-    TMP             .req ip
-
-    vpush           {d8-d15}
-
-    /* Load constants */
-    adr             TMP, jsimd_fdct_ifast_neon_consts
-    vld1.16         {d0}, [TMP, :64]
-
-    /* Load all DATA into NEON registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17    | q8
-     *   1 | d18     | d19    | q9
-     *   2 | d20     | d21    | q10
-     *   3 | d22     | d23    | q11
-     *   4 | d24     | d25    | q12
-     *   5 | d26     | d27    | q13
-     *   6 | d28     | d29    | q14
-     *   7 | d30     | d31    | q15
-     */
-
-    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
-    sub             DATA, DATA, #(128 - 32)
-
-    mov             TMP, #2
-1:
-    /* Transpose */
-    vtrn.16         q12, q13
-    vtrn.16         q10, q11
-    vtrn.16         q8, q9
-    vtrn.16         q14, q15
-    vtrn.32         q9, q11
-    vtrn.32         q13, q15
-    vtrn.32         q8, q10
-    vtrn.32         q12, q14
-    vswp            d30, d23
-    vswp            d24, d17
-    vswp            d26, d19
-      /* 1-D FDCT */
-      vadd.s16        q2, q11, q12
-    vswp            d28, d21
-      vsub.s16        q12, q11, q12
-      vsub.s16        q6, q10, q13
-      vadd.s16        q10, q10, q13
-      vsub.s16        q7, q9, q14
-      vadd.s16        q9, q9, q14
-      vsub.s16        q1, q8, q15
-      vadd.s16        q8, q8, q15
-      vsub.s16        q4, q9, q10
-      vsub.s16        q5, q8, q2
-      vadd.s16        q3, q9, q10
-      vadd.s16        q4, q4, q5
-      vadd.s16        q2, q8, q2
-      vqdmulh.s16     q4, q4, XFIX_0_707106781
-      vadd.s16        q11, q12, q6
-      vadd.s16        q8, q2, q3
-      vsub.s16        q12, q2, q3
-      vadd.s16        q3, q6, q7
-      vadd.s16        q7, q7, q1
-      vqdmulh.s16     q3, q3, XFIX_0_707106781
-      vsub.s16        q6, q11, q7
-      vadd.s16        q10, q5, q4
-      vqdmulh.s16     q6, q6, XFIX_0_382683433
-      vsub.s16        q14, q5, q4
-      vqdmulh.s16     q11, q11, XFIX_0_541196100
-      vqdmulh.s16     q5, q7, XFIX_1_306562965
-      vadd.s16        q4, q1, q3
-      vsub.s16        q3, q1, q3
-      vadd.s16        q7, q7, q6
-      vadd.s16        q11, q11, q6
-      vadd.s16        q7, q7, q5
-      vadd.s16        q13, q3, q11
-      vsub.s16        q11, q3, q11
-      vadd.s16        q9, q4, q7
-      vsub.s16        q15, q4, q7
-    subs            TMP, TMP, #1
-    bne             1b
-
-    /* store results */
-    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
-
-    vpop            {d8-d15}
-    bx              lr
-
-    .unreq          DATA
-    .unreq          TMP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
- *                      DCTELEM *workspace);
- *
- * Note: the code uses 2 stage pipelining in order to improve instructions
- *       scheduling and eliminate stalls (this provides ~15% better
- *       performance for this function on both ARM Cortex-A8 and
- *       ARM Cortex-A9 when compared to the non-pipelined variant).
- *       The instructions which belong to the second stage use different
- *       indentation for better readiability.
- */
-asm_function jsimd_quantize_neon
-
-    COEF_BLOCK      .req r0
-    DIVISORS        .req r1
-    WORKSPACE       .req r2
-
-    RECIPROCAL      .req DIVISORS
-    CORRECTION      .req r3
-    SHIFT           .req ip
-    LOOP_COUNT      .req r4
-
-    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
-    vabs.s16        q12, q0
-    add             CORRECTION, DIVISORS, #(64 * 2)
-    add             SHIFT, DIVISORS, #(64 * 6)
-    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
-    vabs.s16        q13, q1
-    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10  /* add correction */
-    vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
-    vmull.u16       q11, d25, d17
-    vmull.u16       q8, d26, d18
-    vmull.u16       q9, d27, d19
-    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
-    vshrn.u32       d20, q10, #16
-    vshrn.u32       d21, q11, #16
-    vshrn.u32       d22, q8, #16
-    vshrn.u32       d23, q9, #16
-    vneg.s16        q12, q12
-    vneg.s16        q13, q13
-    vshr.s16        q2, q0, #15    /* extract sign */
-    vshr.s16        q3, q1, #15
-    vshl.u16        q14, q10, q12  /* shift */
-    vshl.u16        q15, q11, q13
-
-    push            {r4, r5}
-    mov             LOOP_COUNT, #3
-1:
-    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
-      veor.u16        q14, q14, q2  /* restore sign */
-    vabs.s16        q12, q0
-    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
-    vabs.s16        q13, q1
-      veor.u16        q15, q15, q3
-    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10  /* add correction */
-    vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
-    vmull.u16       q11, d25, d17
-    vmull.u16       q8, d26, d18
-    vmull.u16       q9, d27, d19
-      vsub.u16        q14, q14, q2
-    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
-      vsub.u16        q15, q15, q3
-    vshrn.u32       d20, q10, #16
-    vshrn.u32       d21, q11, #16
-      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-    vshrn.u32       d22, q8, #16
-    vshrn.u32       d23, q9, #16
-    vneg.s16        q12, q12
-    vneg.s16        q13, q13
-    vshr.s16        q2, q0, #15    /* extract sign */
-    vshr.s16        q3, q1, #15
-    vshl.u16        q14, q10, q12  /* shift */
-    vshl.u16        q15, q11, q13
-    subs            LOOP_COUNT, LOOP_COUNT, #1
-    bne             1b
-    pop             {r4, r5}
-
-      veor.u16        q14, q14, q2  /* restore sign */
-      veor.u16        q15, q15, q3
-      vsub.u16        q14, q14, q2
-      vsub.u16        q15, q15, q3
-      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
-    bx              lr  /* return */
-
-    .unreq          COEF_BLOCK
-    .unreq          DIVISORS
-    .unreq          WORKSPACE
-    .unreq          RECIPROCAL
-    .unreq          CORRECTION
-    .unreq          SHIFT
-    .unreq          LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
- *                                 JDIMENSION downsampled_width,
- *                                 JSAMPARRAY input_data,
- *                                 JSAMPARRAY *output_data_ptr);
- *
- * Note: the use of unaligned writes is the main remaining bottleneck in
- *       this code, which can be potentially solved to get up to tens
- *       of percents performance improvement on Cortex-A8/Cortex-A9.
- */
-
-/*
- * Upsample 16 source pixels to 32 destination pixels. The new 16 source
- * pixels are loaded to q0. The previous 16 source pixels are in q1. The
- * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
- * Register d28 is used for multiplication by 3. Register q15 is used
- * for adding +1 bias.
- */
-.macro upsample16 OUTPTR, INPTR
-    vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-    vmov            q1, q0        /* backup source pixels to q1 */
-    vrshrn.u16      d6, q8, #2
-    vrshrn.u16      d7, q9, #2
-    vshrn.u16       d8, q10, #2
-    vshrn.u16       d9, q11, #2
-    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16'
- * macro, the roles of q0 and q1 registers are reversed for even and odd
- * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
- * Also this unrolling allows to reorder loads and stores to compensate
- * multiplication latency and reduce stalls.
- */
-.macro upsample32 OUTPTR, INPTR
-    /* even 16 pixels group */
-    vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-      /* odd 16 pixels group */
-      vld1.8          {q1}, [\INPTR]!
-    vrshrn.u16      d6, q8, #2
-    vrshrn.u16      d7, q9, #2
-    vshrn.u16       d8, q10, #2
-    vshrn.u16       d9, q11, #2
-      vmovl.u8        q8, d2
-      vext.8          q2, q0, q1, #15
-      vmovl.u8        q9, d3
-      vaddw.u8        q10, q15, d4
-      vaddw.u8        q11, q15, d5
-      vmlal.u8        q8, d4, d28
-      vmlal.u8        q9, d5, d28
-      vmlal.u8        q10, d2, d28
-      vmlal.u8        q11, d3, d28
-    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-      vrshrn.u16      d6, q8, #2
-      vrshrn.u16      d7, q9, #2
-      vshrn.u16       d8, q10, #2
-      vshrn.u16       d9, q11, #2
-      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
- */
-.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
-    /* special case for the first and last pixels */
-    sub             \WIDTH, \WIDTH, #1
-    add             \OUTPTR, \OUTPTR, #1
-    ldrb            \TMP1, [\INPTR, \WIDTH]
-    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
-    ldrb            \TMP1, [\INPTR], #1
-    strb            \TMP1, [\OUTPTR, #-1]
-    vmov.8          d3[7], \TMP1
-
-    subs            \WIDTH, \WIDTH, #32
-    blt             5f
-0:  /* process 32 pixels per iteration */
-    upsample32      \OUTPTR, \INPTR
-    subs            \WIDTH, \WIDTH, #32
-    bge             0b
-5:
-    adds            \WIDTH, \WIDTH, #16
-    blt             1f
-0:  /* process 16 pixels if needed */
-    upsample16      \OUTPTR, \INPTR
-    subs            \WIDTH, \WIDTH, #16
-1:
-    adds            \WIDTH, \WIDTH, #16
-    beq             9f
-
-    /* load the remaining 1-15 pixels */
-    add             \INPTR, \INPTR, \WIDTH
-    tst             \WIDTH, #1
-    beq             2f
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #2
-    beq             2f
-    vext.8          d0, d0, d0, #6
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[1]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #4
-    beq             2f
-    vrev64.32       d0, d0
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[3]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[2]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[1]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #8
-    beq             2f
-    vmov            d1, d0
-    sub             \INPTR, \INPTR, #8
-    vld1.8          {d0}, [\INPTR]
-2:  /* upsample the remaining pixels */
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-    vrshrn.u16      d10, q8, #2
-    vrshrn.u16      d12, q9, #2
-    vshrn.u16       d11, q10, #2
-    vshrn.u16       d13, q11, #2
-    vzip.8          d10, d11
-    vzip.8          d12, d13
-    /* store the remaining pixels */
-    tst             \WIDTH, #8
-    beq             2f
-    vst1.8          {d10, d11}, [\OUTPTR]!
-    vmov            q5, q6
-2:
-    tst             \WIDTH, #4
-    beq             2f
-    vst1.8          {d10}, [\OUTPTR]!
-    vmov            d10, d11
-2:
-    tst             \WIDTH, #2
-    beq             2f
-    vst1.8          {d10[0]}, [\OUTPTR]!
-    vst1.8          {d10[1]}, [\OUTPTR]!
-    vst1.8          {d10[2]}, [\OUTPTR]!
-    vst1.8          {d10[3]}, [\OUTPTR]!
-    vext.8          d10, d10, d10, #4
-2:
-    tst             \WIDTH, #1
-    beq             2f
-    vst1.8          {d10[0]}, [\OUTPTR]!
-    vst1.8          {d10[1]}, [\OUTPTR]!
-2:
-9:
-.endm
-
-asm_function jsimd_h2v1_fancy_upsample_neon
-
-    MAX_V_SAMP_FACTOR .req r0
-    DOWNSAMPLED_WIDTH .req r1
-    INPUT_DATA        .req r2
-    OUTPUT_DATA_PTR   .req r3
-    OUTPUT_DATA       .req OUTPUT_DATA_PTR
-
-    OUTPTR            .req r4
-    INPTR             .req r5
-    WIDTH             .req ip
-    TMP               .req lr
-
-    push            {r4, r5, r6, lr}
-    vpush           {d8-d15}
-
-    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
-    cmp             MAX_V_SAMP_FACTOR, #0
-    ble             99f
-
-    /* initialize constants */
-    vmov.u8         d28, #3
-    vmov.u16        q15, #1
-11:
-    ldr             INPTR, [INPUT_DATA], #4
-    ldr             OUTPTR, [OUTPUT_DATA], #4
-    mov             WIDTH, DOWNSAMPLED_WIDTH
-    upsample_row    OUTPTR, INPTR, WIDTH, TMP
-    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
-    bgt             11b
-
-99:
-    vpop            {d8-d15}
-    pop             {r4, r5, r6, pc}
-
-    .unreq          MAX_V_SAMP_FACTOR
-    .unreq          DOWNSAMPLED_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA_PTR
-    .unreq          OUTPUT_DATA
-
-    .unreq          OUTPTR
-    .unreq          INPTR
-    .unreq          WIDTH
-    .unreq          TMP
-
-.purgem upsample16
-.purgem upsample32
-.purgem upsample_row
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET*)
- * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
- *                              JCOEFPTR block, int last_dc_val,
- *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
-.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-    sub             \PUT_BITS, \PUT_BITS, #0x8
-    lsr             \TMP, \PUT_BUFFER, \PUT_BITS
-    uxtb            \TMP, \TMP
-    strb            \TMP, [\BUFFER, #1]!
-    cmp             \TMP, #0xff
-    /*it eq*/
-    strbeq          \ZERO, [\BUFFER, #1]!
-.endm
-
-.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
-    /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
-    add             \PUT_BITS, \SIZE
-    /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
-    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
-.endm
-
-.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-  cmp               \PUT_BITS, #0x10
-  blt               15f
-    eor               \ZERO, \ZERO, \ZERO
-    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-15:
-.endm
-
-.balign 16
-jsimd_huff_encode_one_block_neon_consts:
-  .byte 0x01
-  .byte 0x02
-  .byte 0x04
-  .byte 0x08
-  .byte 0x10
-  .byte 0x20
-  .byte 0x40
-  .byte 0x80
-
-asm_function jsimd_huff_encode_one_block_neon
-    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-    add             r7, sp, #0x1c
-    sub             r4, sp, #0x40
-    bfc             r4, #0, #5
-    mov             sp, r4           /* align sp on 32 bytes */
-    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
-    vst1.64         {d12, d13, d14, d15}, [r4, :128]
-    sub             sp, #0x140       /* reserve 320 bytes */
-    str             r0, [sp, #0x18]  /* working state > sp + Ox18 */
-    add             r4, sp, #0x20    /* r4 = t1 */
-    ldr             lr, [r7, #0x8]   /* lr = dctbl */
-    sub             r10, r1, #0x1    /* r10=buffer-- */
-    ldrsh           r1, [r2]
-    mov             r9, #0x10
-    mov             r8, #0x1
-    adr             r5, jsimd_huff_encode_one_block_neon_consts
-    /* prepare data */
-    vld1.8          {d26}, [r5, :64]
-    veor            q8, q8, q8
-    veor            q9, q9, q9
-    vdup.16         q14, r9
-    vdup.16         q15, r8
-    veor            q10, q10, q10
-    veor            q11, q11, q11
-    sub             r1, r1, r3
-    add             r9, r2, #0x22
-    add             r8, r2, #0x18
-    add             r3, r2, #0x36
-    vmov.16         d0[0], r1
-    vld1.16         {d2[0]}, [r9, :16]
-    vld1.16         {d4[0]}, [r8, :16]
-    vld1.16         {d6[0]}, [r3, :16]
-    add             r1, r2, #0x2
-    add             r9, r2, #0x30
-    add             r8, r2, #0x26
-    add             r3, r2, #0x28
-    vld1.16         {d0[1]}, [r1, :16]
-    vld1.16         {d2[1]}, [r9, :16]
-    vld1.16         {d4[1]}, [r8, :16]
-    vld1.16         {d6[1]}, [r3, :16]
-    add             r1, r2, #0x10
-    add             r9, r2, #0x40
-    add             r8, r2, #0x34
-    add             r3, r2, #0x1a
-    vld1.16         {d0[2]}, [r1, :16]
-    vld1.16         {d2[2]}, [r9, :16]
-    vld1.16         {d4[2]}, [r8, :16]
-    vld1.16         {d6[2]}, [r3, :16]
-    add             r1, r2, #0x20
-    add             r9, r2, #0x32
-    add             r8, r2, #0x42
-    add             r3, r2, #0xc
-    vld1.16         {d0[3]}, [r1, :16]
-    vld1.16         {d2[3]}, [r9, :16]
-    vld1.16         {d4[3]}, [r8, :16]
-    vld1.16         {d6[3]}, [r3, :16]
-    add             r1, r2, #0x12
-    add             r9, r2, #0x24
-    add             r8, r2, #0x50
-    add             r3, r2, #0xe
-    vld1.16         {d1[0]}, [r1, :16]
-    vld1.16         {d3[0]}, [r9, :16]
-    vld1.16         {d5[0]}, [r8, :16]
-    vld1.16         {d7[0]}, [r3, :16]
-    add             r1, r2, #0x4
-    add             r9, r2, #0x16
-    add             r8, r2, #0x60
-    add             r3, r2, #0x1c
-    vld1.16         {d1[1]}, [r1, :16]
-    vld1.16         {d3[1]}, [r9, :16]
-    vld1.16         {d5[1]}, [r8, :16]
-    vld1.16         {d7[1]}, [r3, :16]
-    add             r1, r2, #0x6
-    add             r9, r2, #0x8
-    add             r8, r2, #0x52
-    add             r3, r2, #0x2a
-    vld1.16         {d1[2]}, [r1, :16]
-    vld1.16         {d3[2]}, [r9, :16]
-    vld1.16         {d5[2]}, [r8, :16]
-    vld1.16         {d7[2]}, [r3, :16]
-    add             r1, r2, #0x14
-    add             r9, r2, #0xa
-    add             r8, r2, #0x44
-    add             r3, r2, #0x38
-    vld1.16         {d1[3]}, [r1, :16]
-    vld1.16         {d3[3]}, [r9, :16]
-    vld1.16         {d5[3]}, [r8, :16]
-    vld1.16         {d7[3]}, [r3, :16]
-    vcgt.s16        q8, q8, q0
-    vcgt.s16        q9, q9, q1
-    vcgt.s16        q10, q10, q2
-    vcgt.s16        q11, q11, q3
-    vabs.s16        q0, q0
-    vabs.s16        q1, q1
-    vabs.s16        q2, q2
-    vabs.s16        q3, q3
-    veor            q8, q8, q0
-    veor            q9, q9, q1
-    veor            q10, q10, q2
-    veor            q11, q11, q3
-    add             r9, r4, #0x20
-    add             r8, r4, #0x80
-    add             r3, r4, #0xa0
-    vclz.i16        q0, q0
-    vclz.i16        q1, q1
-    vclz.i16        q2, q2
-    vclz.i16        q3, q3
-    vsub.i16        q0, q14, q0
-    vsub.i16        q1, q14, q1
-    vsub.i16        q2, q14, q2
-    vsub.i16        q3, q14, q3
-    vst1.16         {d0, d1, d2, d3}, [r4, :256]
-    vst1.16         {d4, d5, d6, d7}, [r9, :256]
-    vshl.s16        q0, q15, q0
-    vshl.s16        q1, q15, q1
-    vshl.s16        q2, q15, q2
-    vshl.s16        q3, q15, q3
-    vsub.i16        q0, q0, q15
-    vsub.i16        q1, q1, q15
-    vsub.i16        q2, q2, q15
-    vsub.i16        q3, q3, q15
-    vand            q8, q8, q0
-    vand            q9, q9, q1
-    vand            q10, q10, q2
-    vand            q11, q11, q3
-    vst1.16         {d16, d17, d18, d19}, [r8, :256]
-    vst1.16         {d20, d21, d22, d23}, [r3, :256]
-    add             r1, r2, #0x46
-    add             r9, r2, #0x3a
-    add             r8, r2, #0x74
-    add             r3, r2, #0x6a
-    vld1.16         {d8[0]}, [r1, :16]
-    vld1.16         {d10[0]}, [r9, :16]
-    vld1.16         {d12[0]}, [r8, :16]
-    vld1.16         {d14[0]}, [r3, :16]
-    veor            q8, q8, q8
-    veor            q9, q9, q9
-    veor            q10, q10, q10
-    veor            q11, q11, q11
-    add             r1, r2, #0x54
-    add             r9, r2, #0x2c
-    add             r8, r2, #0x76
-    add             r3, r2, #0x78
-    vld1.16         {d8[1]}, [r1, :16]
-    vld1.16         {d10[1]}, [r9, :16]
-    vld1.16         {d12[1]}, [r8, :16]
-    vld1.16         {d14[1]}, [r3, :16]
-    add             r1, r2, #0x62
-    add             r9, r2, #0x1e
-    add             r8, r2, #0x68
-    add             r3, r2, #0x7a
-    vld1.16         {d8[2]}, [r1, :16]
-    vld1.16         {d10[2]}, [r9, :16]
-    vld1.16         {d12[2]}, [r8, :16]
-    vld1.16         {d14[2]}, [r3, :16]
-    add             r1, r2, #0x70
-    add             r9, r2, #0x2e
-    add             r8, r2, #0x5a
-    add             r3, r2, #0x6c
-    vld1.16         {d8[3]}, [r1, :16]
-    vld1.16         {d10[3]}, [r9, :16]
-    vld1.16         {d12[3]}, [r8, :16]
-    vld1.16         {d14[3]}, [r3, :16]
-    add             r1, r2, #0x72
-    add             r9, r2, #0x3c
-    add             r8, r2, #0x4c
-    add             r3, r2, #0x5e
-    vld1.16         {d9[0]}, [r1, :16]
-    vld1.16         {d11[0]}, [r9, :16]
-    vld1.16         {d13[0]}, [r8, :16]
-    vld1.16         {d15[0]}, [r3, :16]
-    add             r1, r2, #0x64
-    add             r9, r2, #0x4a
-    add             r8, r2, #0x3e
-    add             r3, r2, #0x6e
-    vld1.16         {d9[1]}, [r1, :16]
-    vld1.16         {d11[1]}, [r9, :16]
-    vld1.16         {d13[1]}, [r8, :16]
-    vld1.16         {d15[1]}, [r3, :16]
-    add             r1, r2, #0x56
-    add             r9, r2, #0x58
-    add             r8, r2, #0x4e
-    add             r3, r2, #0x7c
-    vld1.16         {d9[2]}, [r1, :16]
-    vld1.16         {d11[2]}, [r9, :16]
-    vld1.16         {d13[2]}, [r8, :16]
-    vld1.16         {d15[2]}, [r3, :16]
-    add             r1, r2, #0x48
-    add             r9, r2, #0x66
-    add             r8, r2, #0x5c
-    add             r3, r2, #0x7e
-    vld1.16         {d9[3]}, [r1, :16]
-    vld1.16         {d11[3]}, [r9, :16]
-    vld1.16         {d13[3]}, [r8, :16]
-    vld1.16         {d15[3]}, [r3, :16]
-    vcgt.s16        q8, q8, q4
-    vcgt.s16        q9, q9, q5
-    vcgt.s16        q10, q10, q6
-    vcgt.s16        q11, q11, q7
-    vabs.s16        q4, q4
-    vabs.s16        q5, q5
-    vabs.s16        q6, q6
-    vabs.s16        q7, q7
-    veor            q8, q8, q4
-    veor            q9, q9, q5
-    veor            q10, q10, q6
-    veor            q11, q11, q7
-    add             r1, r4, #0x40
-    add             r9, r4, #0x60
-    add             r8, r4, #0xc0
-    add             r3, r4, #0xe0
-    vclz.i16        q4, q4
-    vclz.i16        q5, q5
-    vclz.i16        q6, q6
-    vclz.i16        q7, q7
-    vsub.i16        q4, q14, q4
-    vsub.i16        q5, q14, q5
-    vsub.i16        q6, q14, q6
-    vsub.i16        q7, q14, q7
-    vst1.16         {d8, d9, d10, d11}, [r1, :256]
-    vst1.16         {d12, d13, d14, d15}, [r9, :256]
-    vshl.s16        q4, q15, q4
-    vshl.s16        q5, q15, q5
-    vshl.s16        q6, q15, q6
-    vshl.s16        q7, q15, q7
-    vsub.i16        q4, q4, q15
-    vsub.i16        q5, q5, q15
-    vsub.i16        q6, q6, q15
-    vsub.i16        q7, q7, q15
-    vand            q8, q8, q4
-    vand            q9, q9, q5
-    vand            q10, q10, q6
-    vand            q11, q11, q7
-    vst1.16         {d16, d17, d18, d19}, [r8, :256]
-    vst1.16         {d20, d21, d22, d23}, [r3, :256]
-    ldr             r12, [r7, #0xc]       /* r12 = actbl */
-    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
-    mov             r9, r12               /* r9 = actbl */
-    add             r6, r4, #0x80         /* r6 = t2 */
-    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
-    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
-    ldrh            r2, [r6, #-128]       /* r2  = nbits */
-    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
-    ldr             r0, [lr, r2, lsl #2]
-    ldrb            r5, [r1, r2]
-    put_bits        r11, r4, r0, r5
-    checkbuf15      r10, r11, r4, r5, r0
-    put_bits        r11, r4, r3, r2
-    checkbuf15      r10, r11, r4, r5, r0
-    mov             lr, r6                /* lr = t2 */
-    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
-    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
-    veor            q8, q8, q8
-    vceq.i16        q0, q0, q8
-    vceq.i16        q1, q1, q8
-    vceq.i16        q2, q2, q8
-    vceq.i16        q3, q3, q8
-    vceq.i16        q4, q4, q8
-    vceq.i16        q5, q5, q8
-    vceq.i16        q6, q6, q8
-    vceq.i16        q7, q7, q8
-    vmovn.i16       d0, q0
-    vmovn.i16       d2, q1
-    vmovn.i16       d4, q2
-    vmovn.i16       d6, q3
-    vmovn.i16       d8, q4
-    vmovn.i16       d10, q5
-    vmovn.i16       d12, q6
-    vmovn.i16       d14, q7
-    vand            d0, d0, d26
-    vand            d2, d2, d26
-    vand            d4, d4, d26
-    vand            d6, d6, d26
-    vand            d8, d8, d26
-    vand            d10, d10, d26
-    vand            d12, d12, d26
-    vand            d14, d14, d26
-    vpadd.i8        d0, d0, d2
-    vpadd.i8        d4, d4, d6
-    vpadd.i8        d8, d8, d10
-    vpadd.i8        d12, d12, d14
-    vpadd.i8        d0, d0, d4
-    vpadd.i8        d8, d8, d12
-    vpadd.i8        d0, d0, d8
-    vmov.32         r1, d0[1]
-    vmov.32         r8, d0[0]
-    mvn             r1, r1
-    mvn             r8, r8
-    lsrs            r1, r1, #0x1
-    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
-    rbit            r1, r1            /* r1 = index1 */
-    rbit            r8, r8            /* r8 = index0 */
-    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
-    str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
-    cmp             r8, #0x0
-    beq             6f
-1:
-    clz             r2, r8
-    add             lr, lr, r2, lsl #1
-    lsl             r8, r8, r2
-    ldrh            r1, [lr, #-126]
-2:
-    cmp             r2, #0x10
-    blt             3f
-    sub             r2, r2, #0x10
-    put_bits        r11, r4, r0, r6
-    cmp             r4, #0x10
-    blt             2b
-    eor             r3, r3, r3
-    emit_byte       r10, r11, r4, r3, r12
-    emit_byte       r10, r11, r4, r3, r12
-    b               2b
-3:
-    add             r2, r1, r2, lsl #4
-    ldrh            r3, [lr, #2]!
-    ldr             r12, [r9, r2, lsl #2]
-    ldrb            r2, [r5, r2]
-    put_bits        r11, r4, r12, r2
-    checkbuf15      r10, r11, r4, r2, r12
-    put_bits        r11, r4, r3, r1
-    checkbuf15      r10, r11, r4, r2, r12
-    lsls            r8, r8, #0x1
-    bne             1b
-6:
-    add             r12, sp, #0x20   /* r12 = t1 */
-    ldr             r8, [sp, #0x14]  /* r8 = index1 */
-    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
-    cmp             r8, #0x0
-    beq             6f
-    clz             r2, r8
-    sub             r12, r12, lr
-    lsl             r8, r8, r2
-    add             r2, r2, r12, lsr #1
-    add             lr, lr, r2, lsl #1
-    b               7f
-1:
-    clz             r2, r8
-    add             lr, lr, r2, lsl #1
-    lsl             r8, r8, r2
-7:
-    ldrh            r1, [lr, #-126]
-2:
-    cmp             r2, #0x10
-    blt             3f
-    sub             r2, r2, #0x10
-    put_bits        r11, r4, r0, r6
-    cmp             r4, #0x10
-    blt             2b
-    eor             r3, r3, r3
-    emit_byte       r10, r11, r4, r3, r12
-    emit_byte       r10, r11, r4, r3, r12
-    b               2b
-3:
-    add             r2, r1, r2, lsl #4
-    ldrh            r3, [lr, #2]!
-    ldr             r12, [r9, r2, lsl #2]
-    ldrb            r2, [r5, r2]
-    put_bits        r11, r4, r12, r2
-    checkbuf15      r10, r11, r4, r2, r12
-    put_bits        r11, r4, r3, r1
-    checkbuf15      r10, r11, r4, r2, r12
-    lsls            r8, r8, #0x1
-    bne             1b
-6:
-    add             r0, sp, #0x20
-    add             r0, #0xfe
-    cmp             lr, r0
-    bhs             1f
-    ldr             r1, [r9]
-    ldrb            r0, [r5]
-    put_bits        r11, r4, r1, r0
-    checkbuf15      r10, r11, r4, r0, r1
-1:
-    ldr             r12, [sp, #0x18]
-    str             r11, [r12, #0x8]
-    str             r4, [r12, #0xc]
-    add             r0, r10, #0x1
-    add             r4, sp, #0x140
-    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
-    vld1.64         {d12, d13, d14, d15}, [r4, :128]
-    sub             r4, r7, #0x1c
-    mov             sp, r4
-    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf15
diff --git a/media/libjpeg/simd/jsimd_i386.c b/media/libjpeg/simd/jsimd_i386.c
deleted file mode 100644
index 6da8bd8913..0000000000
--- a/media/libjpeg/simd/jsimd_i386.c
+++ /dev/null
@@ -1,1091 +0,0 @@
-/*
- * jsimd_i386.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 32-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = jpeg_simd_cpu_support();
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCEMMX");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_MMX;
-  env = getenv("JSIMD_FORCE3DNOW");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_3DNOW|JSIMD_MMX;
-  env = getenv("JSIMD_FORCESSE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_SSE|JSIMD_MMX;
-  env = getenv("JSIMD_FORCESSE2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support &= JSIMD_SSE2;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-  void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_ycc_convert_sse2;
-      mmxfct=jsimd_extrgb_ycc_convert_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
-      mmxfct=jsimd_extrgbx_ycc_convert_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_ycc_convert_sse2;
-      mmxfct=jsimd_extbgr_ycc_convert_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
-      mmxfct=jsimd_extbgrx_ycc_convert_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
-      mmxfct=jsimd_extxbgr_ycc_convert_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
-      mmxfct=jsimd_extxrgb_ycc_convert_mmx;
-      break;
-    default:
-      sse2fct=jsimd_rgb_ycc_convert_sse2;
-      mmxfct=jsimd_rgb_ycc_convert_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-  void (*mmxfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_gray_convert_sse2;
-      mmxfct=jsimd_extrgb_gray_convert_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_gray_convert_sse2;
-      mmxfct=jsimd_extrgbx_gray_convert_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_gray_convert_sse2;
-      mmxfct=jsimd_extbgr_gray_convert_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_gray_convert_sse2;
-      mmxfct=jsimd_extbgrx_gray_convert_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_gray_convert_sse2;
-      mmxfct=jsimd_extxbgr_gray_convert_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_gray_convert_sse2;
-      mmxfct=jsimd_extxrgb_gray_convert_mmx;
-      break;
-    default:
-      sse2fct=jsimd_rgb_gray_convert_sse2;
-      mmxfct=jsimd_rgb_gray_convert_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_ycc_extrgb_convert_sse2;
-      mmxfct=jsimd_ycc_extrgb_convert_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
-      mmxfct=jsimd_ycc_extrgbx_convert_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_ycc_extbgr_convert_sse2;
-      mmxfct=jsimd_ycc_extbgr_convert_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
-      mmxfct=jsimd_ycc_extbgrx_convert_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
-      mmxfct=jsimd_ycc_extxbgr_convert_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
-      mmxfct=jsimd_ycc_extxrgb_convert_mmx;
-      break;
-    default:
-      sse2fct=jsimd_ycc_rgb_convert_sse2;
-      mmxfct=jsimd_ycc_rgb_convert_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                               compptr->v_samp_factor,
-                               compptr->width_in_blocks, input_data,
-                               output_data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
-                              compptr->v_samp_factor, compptr->width_in_blocks,
-                              input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                               compptr->v_samp_factor,
-                               compptr->width_in_blocks, input_data,
-                               output_data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
-                              compptr->v_samp_factor, compptr->width_in_blocks,
-                              input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                             input_data, output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
-                            input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                             input_data, output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor, cinfo->output_width,
-                            input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                   compptr->downsampled_width, input_data,
-                                   output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
-                                  compptr->downsampled_width, input_data,
-                                  output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                   compptr->downsampled_width, input_data,
-                                   output_data_ptr);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
-                                  compptr->downsampled_width, input_data,
-                                  output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
-      break;
-    default:
-      sse2fct=jsimd_h2v2_merged_upsample_sse2;
-      mmxfct=jsimd_h2v2_merged_upsample_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
-      break;
-    default:
-      sse2fct=jsimd_h2v1_merged_upsample_sse2;
-      mmxfct=jsimd_h2v1_merged_upsample_mmx;
-      break;
-  }
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-  else if (simd_support & JSIMD_MMX)
-    mmxfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_SSE)
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_convsamp_sse2(sample_data, start_col, workspace);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_convsamp_mmx(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
-  else if (simd_support & JSIMD_SSE)
-    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    jsimd_fdct_islow_sse2(data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_fdct_islow_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    jsimd_fdct_ifast_sse2(data);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_fdct_ifast_mmx(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    jsimd_fdct_float_sse(data);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_fdct_float_3dnow(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-  if (simd_support & JSIMD_SSE)
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_quantize_sse2(coef_block, divisors, workspace);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_quantize_mmx(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-  if (simd_support & JSIMD_SSE2)
-    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
-  else if (simd_support & JSIMD_SSE)
-    jsimd_quantize_float_sse(coef_block, divisors, workspace);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    return 1;
-  if (simd_support & JSIMD_MMX)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-  if (sizeof(FLOAT_MULT_TYPE) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    return 1;
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
-    return 1;
-  if (simd_support & JSIMD_3DNOW)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
-                          output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf,
-                         output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
-                          output_col);
-  else if (simd_support & JSIMD_MMX)
-    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf,
-                         output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
-                          output_col);
-  else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
-    jsimd_idct_float_sse(compptr->dct_table, coef_block, output_buf,
-                         output_col);
-  else if (simd_support & JSIMD_3DNOW)
-    jsimd_idct_float_3dnow(compptr->dct_table, coef_block, output_buf,
-                           output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
-      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
-                                          dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimd_mips.c b/media/libjpeg/simd/jsimd_mips.c
deleted file mode 100644
index 63b8115d16..0000000000
--- a/media/libjpeg/simd/jsimd_mips.c
+++ /dev/null
@@ -1,1138 +0,0 @@
-/*
- * jsimd_mips.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * MIPS architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__)
-
-LOCAL(int)
-parse_proc_cpuinfo(const char* search_string)
-{
-  const char* file_name = "/proc/cpuinfo";
-  char cpuinfo_line[256];
-  FILE* f = NULL;
-  simd_support = 0;
-
-  if ((f = fopen(file_name, "r")) != NULL) {
-    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
-      if (strstr(cpuinfo_line, search_string) != NULL) {
-        fclose(f);
-        simd_support |= JSIMD_MIPS_DSPR2;
-        return 1;
-      }
-    }
-    fclose(f);
-  }
-  /* Did not find string in the proc file, or not Linux ELF. */
-  return 0;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-#if defined(__MIPSEL__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-  simd_support |= JSIMD_MIPS_DSPR2;
-#elif defined(__linux__)
-  /* We still have a chance to use MIPS DSPR2 regardless of globally used
-   * -mdspr2 options passed to gcc by performing runtime detection via
-   * /proc/cpuinfo parsing on linux */
-  if (!parse_proc_cpuinfo("MIPS 74K"))
-    return;
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCEDSPR2");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_MIPS_DSPR2;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-}
-
-static const int mips_idct_ifast_coefs[4] = {
-  0x45404540,           // FIX( 1.082392200 / 2) =  17734 = 0x4546
-  0x5A805A80,           // FIX( 1.414213562 / 2) =  23170 = 0x5A82
-  0x76407640,           // FIX( 1.847759065 / 2) =  30274 = 0x7642
-  0xAC60AC60            // FIX(-2.613125930 / 4) = -21407 = 0xAC61
-};
-
-/* The following struct is borrowed from jdsample.c */
-typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
-                               jpeg_component_info *compptr,
-                               JSAMPARRAY input_data,
-                               JSAMPARRAY *output_data_ptr);
-
-typedef struct {
-  struct jpeg_upsampler pub;
-  JSAMPARRAY color_buf[MAX_COMPONENTS];
-  upsample1_ptr methods[MAX_COMPONENTS];
-  int next_row_out;
-  JDIMENSION rows_to_go;
-  int rowgroup_height[MAX_COMPONENTS];
-  UINT8 h_expand[MAX_COMPONENTS];
-  UINT8 v_expand[MAX_COMPONENTS];
-} my_upsampler;
-
-typedef my_upsampler *my_upsample_ptr;
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_c_can_null_convert (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_extrgbx_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_extbgr_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_extbgrx_ycc_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_extxbgr_ycc_convert_mips_dspr2;
-
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_extxrgb_ycc_convert_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_extrgb_ycc_convert_mips_dspr2;
-      break;
-  }
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
-                 num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_extrgbx_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_extbgr_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_extbgrx_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_extxbgr_gray_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_extxrgb_gray_convert_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_extrgb_gray_convert_mips_dspr2;
-      break;
-  }
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    mipsdspr2fct(cinfo->image_width, input_buf, output_buf, output_row,
-                 num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_ycc_extrgbx_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_ycc_extbgr_convert_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_ycc_extbgrx_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_ycc_extxbgr_convert_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_ycc_extxrgb_convert_mips_dspr2;
-      break;
-  default:
-      mipsdspr2fct=jsimd_ycc_extrgb_convert_mips_dspr2;
-      break;
-  }
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    mipsdspr2fct(cinfo->output_width, input_buf, input_row, output_buf,
-                 num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(void)
-jsimd_c_null_convert (j_compress_ptr cinfo,
-                      JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                      JDIMENSION output_row, int num_rows)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_c_null_convert_mips_dspr2(cinfo->image_width, input_buf,
-                                    output_buf, output_row, num_rows,
-                                    cinfo->num_components);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_smooth_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if(DCTSIZE != 8)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v2_downsample_mips_dspr2(cinfo->image_width,
-                                     cinfo->max_v_samp_factor,
-                                     compptr->v_samp_factor,
-                                     compptr->width_in_blocks, input_data,
-                                     output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v2_smooth_downsample (j_compress_ptr cinfo,
-                              jpeg_component_info *compptr,
-                              JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_smooth_downsample_mips_dspr2(input_data, output_data,
-                                          compptr->v_samp_factor,
-                                          cinfo->max_v_samp_factor,
-                                          cinfo->smoothing_factor,
-                                          compptr->width_in_blocks,
-                                          cinfo->image_width);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v1_downsample_mips_dspr2(cinfo->image_width,
-                                     cinfo->max_v_samp_factor,
-                                     compptr->v_samp_factor,
-                                     compptr->width_in_blocks,
-                                     input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_int_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v2_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                   cinfo->output_width, input_data,
-                                   output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v1_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                   cinfo->output_width, input_data,
-                                   output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_int_upsample (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
-{
-  my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
-
-  jsimd_int_upsample_mips_dspr2(upsample->h_expand[compptr->component_index],
-                                upsample->v_expand[compptr->component_index],
-                                input_data, output_data_ptr,
-                                cinfo->output_width,
-                                cinfo->max_v_samp_factor);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v2_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                         compptr->downsampled_width,
-                                         input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_h2v1_fancy_upsample_mips_dspr2(cinfo->max_v_samp_factor,
-                                         compptr->downsampled_width,
-                                         input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
-                       JSAMPLE *);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_h2v2_extbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_h2v2_extrgb_merged_upsample_mips_dspr2;
-      break;
-  }
-
-  mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
-               cinfo->sample_range_limit);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*mipsdspr2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY,
-                       JSAMPLE *);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      mipsdspr2fct=jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGR:
-      mipsdspr2fct=jsimd_h2v1_extbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      mipsdspr2fct=jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      mipsdspr2fct=jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      mipsdspr2fct=jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2;
-      break;
-    default:
-      mipsdspr2fct=jsimd_h2v1_extrgb_merged_upsample_mips_dspr2;
-      break;
-  }
-
-  mipsdspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
-               cinfo->sample_range_limit);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_convsamp_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-  if ((simd_support & JSIMD_MIPS_DSPR2))
-    jsimd_convsamp_float_mips_dspr2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_fdct_islow_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_fdct_ifast_mips_dspr2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_quantize_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_quantize_float_mips_dspr2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_6x6 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_12x12 (void)
-{
-  init_simd();
-
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    jsimd_idct_2x2_mips_dspr2(compptr->dct_table, coef_block, output_buf,
-                              output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    int workspace[DCTSIZE*4];  /* buffers data between passes */
-    jsimd_idct_4x4_mips_dspr2(compptr->dct_table, coef_block, output_buf,
-                              output_col, workspace);
-  }
-}
-
-GLOBAL(void)
-jsimd_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-           JCOEFPTR coef_block, JSAMPARRAY output_buf,
-           JDIMENSION output_col)
-{
-    if (simd_support & JSIMD_MIPS_DSPR2)
-      jsimd_idct_6x6_mips_dspr2(compptr->dct_table, coef_block, output_buf,
-                                output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block,
-                  JSAMPARRAY output_buf, JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    int workspace[96];
-    int output[12] = {
-      (int)(output_buf[0] + output_col),
-      (int)(output_buf[1] + output_col),
-      (int)(output_buf[2] + output_col),
-      (int)(output_buf[3] + output_col),
-      (int)(output_buf[4] + output_col),
-      (int)(output_buf[5] + output_col),
-      (int)(output_buf[6] + output_col),
-      (int)(output_buf[7] + output_col),
-      (int)(output_buf[8] + output_col),
-      (int)(output_buf[9] + output_col),
-      (int)(output_buf[10] + output_col),
-      (int)(output_buf[11] + output_col),
-    };
-    jsimd_idct_12x12_pass1_mips_dspr2(coef_block, compptr->dct_table,
-                                      workspace);
-    jsimd_idct_12x12_pass2_mips_dspr2(workspace, output);
-  }
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if (simd_support & JSIMD_MIPS_DSPR2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    int output[8] = {
-      (int)(output_buf[0] + output_col),
-      (int)(output_buf[1] + output_col),
-      (int)(output_buf[2] + output_col),
-      (int)(output_buf[3] + output_col),
-      (int)(output_buf[4] + output_col),
-      (int)(output_buf[5] + output_col),
-      (int)(output_buf[6] + output_col),
-      (int)(output_buf[7] + output_col),
-    };
-
-    jsimd_idct_islow_mips_dspr2(coef_block, compptr->dct_table,
-                                output, IDCT_range_limit(cinfo));
-  }
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  if (simd_support & JSIMD_MIPS_DSPR2) {
-    JCOEFPTR inptr;
-    IFAST_MULT_TYPE *quantptr;
-    DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
-
-    /* Pass 1: process columns from input, store into work array. */
-
-    inptr = coef_block;
-    quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
-
-    jsimd_idct_ifast_cols_mips_dspr2(inptr, quantptr,
-                                     workspace, mips_idct_ifast_coefs);
-
-    /* Pass 2: process rows from work array, store into output array. */
-    /* Note that we must descale the results by a factor of 8 == 2**3, */
-    /* and also undo the PASS1_BITS scaling. */
-
-    jsimd_idct_ifast_rows_mips_dspr2(workspace, output_buf,
-                                     output_col, mips_idct_ifast_coefs);
-  }
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return NULL;
-}
diff --git a/media/libjpeg/simd/jsimd_mips_dspr2.S b/media/libjpeg/simd/jsimd_mips_dspr2.S
deleted file mode 100644
index c8c286cb3e..0000000000
--- a/media/libjpeg/simd/jsimd_mips_dspr2.S
+++ /dev/null
@@ -1,4487 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
- *           Darko Laus       (darko.laus@imgtec.com)
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#include "jsimd_mips_dspr2_asm.h"
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - output_buf
- * a3     - output_row
- * 16(sp) - num_rows
- * 20(sp) - cinfo->num_components
- *
- * Null conversion for compression
- */
-
-    SAVE_REGS_ON_STACK 8, s0, s1
-
-    lw        t9, 24(sp)   // t9 = num_rows
-    lw        s0, 28(sp)   // s0 = cinfo->num_components
-    andi      t0, a0, 3    // t0 = cinfo->image_width & 3
-    beqz      t0, 4f       // no residual
-     nop
-0:
-    addiu     t9, t9, -1
-    bltz      t9, 7f
-     li       t1, 0
-1:
-    sll       t3, t1, 2
-    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
-    lw        t2, 0(a1)    // t2 = inptr = *input_buf
-    sll       t4, a3, 2
-    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
-    addu      t2, t2, t1
-    addu      s1, t5, a0
-    addu      t6, t5, t0
-2:
-    lbu       t3, 0(t2)
-    addiu     t5, t5, 1
-    sb        t3, -1(t5)
-    bne       t6, t5, 2b
-     addu     t2, t2, s0
-3:
-    lbu       t3, 0(t2)
-    addu      t4, t2, s0
-    addu      t7, t4, s0
-    addu      t8, t7, s0
-    addu      t2, t8, s0
-    lbu       t4, 0(t4)
-    lbu       t7, 0(t7)
-    lbu       t8, 0(t8)
-    addiu     t5, t5, 4
-    sb        t3, -4(t5)
-    sb        t4, -3(t5)
-    sb        t7, -2(t5)
-    bne       s1, t5, 3b
-     sb       t8, -1(t5)
-    addiu     t1, t1, 1
-    bne       t1, s0, 1b
-     nop
-    addiu     a1, a1, 4
-    bgez      t9, 0b
-     addiu    a3, a3, 1
-    b         7f
-     nop
-4:
-    addiu     t9, t9, -1
-    bltz      t9, 7f
-     li       t1, 0
-5:
-    sll       t3, t1, 2
-    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
-    lw        t2, 0(a1)    // t2 = inptr = *input_buf
-    sll       t4, a3, 2
-    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
-    addu      t2, t2, t1
-    addu      s1, t5, a0
-    addu      t6, t5, t0
-6:
-    lbu       t3, 0(t2)
-    addu      t4, t2, s0
-    addu      t7, t4, s0
-    addu      t8, t7, s0
-    addu      t2, t8, s0
-    lbu       t4, 0(t4)
-    lbu       t7, 0(t7)
-    lbu       t8, 0(t8)
-    addiu     t5, t5, 4
-    sb        t3, -4(t5)
-    sb        t4, -3(t5)
-    sb        t7, -2(t5)
-    bne       s1, t5, 6b
-     sb       t8, -1(t5)
-    addiu     t1, t1, 1
-    bne       t1, s0, 5b
-     nop
-    addiu     a1, a1, 4
-    bgez      t9, 4b
-     addiu    a3, a3, 1
-7:
-    RESTORE_REGS_FROM_STACK 8, s0, s1
-
-    j         ra
-     nop
-
-END(jsimd_c_null_convert_mips_dspr2)
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_ycc_convert_mips_dspr2
- * jsimd_extbgr_ycc_convert_mips_dspr2
- * jsimd_extrgbx_ycc_convert_mips_dspr2
- * jsimd_extbgrx_ycc_convert_mips_dspr2
- * jsimd_extxbgr_ycc_convert_mips_dspr2
- * jsimd_extxrgb_ycc_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_YCC r,    \
-                     g,    \
-                     b,    \
-                     inptr
-    lbu     \r, \r_offs(\inptr)
-    lbu     \g, \g_offs(\inptr)
-    lbu     \b, \b_offs(\inptr)
-    addiu   \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - output_buf
- * a3     - output_row
- * 16(sp) - num_rows
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw      t7, 48(sp)        // t7 = num_rows
-    li      s0, 0x4c8b        // FIX(0.29900)
-    li      s1, 0x9646        // FIX(0.58700)
-    li      s2, 0x1d2f        // FIX(0.11400)
-    li      s3, 0xffffd4cd    // -FIX(0.16874)
-    li      s4, 0xffffab33    // -FIX(0.33126)
-    li      s5, 0x8000        // FIX(0.50000)
-    li      s6, 0xffff94d1    // -FIX(0.41869)
-    li      s7, 0xffffeb2f    // -FIX(0.08131)
-    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
-
-0:
-    addiu   t7, -1            // --num_rows
-    lw      t6, 0(a1)         // t6 = input_buf[0]
-    lw      t0, 0(a2)
-    lw      t1, 4(a2)
-    lw      t2, 8(a2)
-    sll     t3, a3, 2
-    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
-    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
-    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
-
-    addu    t9, t2, a0        // t9 = end address
-    addiu   a3, 1
-
-1:
-    DO_RGB_TO_YCC t3, t4, t5, t6
-
-    mtlo    s5, $ac0
-    mtlo    t8, $ac1
-    mtlo    t8, $ac2
-    maddu   $ac0, s2, t5
-    maddu   $ac1, s5, t5
-    maddu   $ac2, s5, t3
-    maddu   $ac0, s0, t3
-    maddu   $ac1, s3, t3
-    maddu   $ac2, s6, t4
-    maddu   $ac0, s1, t4
-    maddu   $ac1, s4, t4
-    maddu   $ac2, s7, t5
-    extr.w  t3, $ac0, 16
-    extr.w  t4, $ac1, 16
-    extr.w  t5, $ac2, 16
-    sb      t3, 0(t0)
-    sb      t4, 0(t1)
-    sb      t5, 0(t2)
-    addiu   t0, 1
-    addiu   t2, 1
-    bne     t2, t9, 1b
-     addiu  t1, 1
-    bgtz    t7, 0b
-     addiu  a1, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j ra
-     nop
-END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_YCC
-
-.endm
-
-/*------------------------------------------id -- pix R  G  B */
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
-
-/*****************************************************************************/
-/*
- * jsimd_ycc_extrgb_convert_mips_dspr2
- * jsimd_ycc_extbgr_convert_mips_dspr2
- * jsimd_ycc_extrgbx_convert_mips_dspr2
- * jsimd_ycc_extbgrx_convert_mips_dspr2
- * jsimd_ycc_extxbgr_convert_mips_dspr2
- * jsimd_ycc_extxrgb_convert_mips_dspr2
- *
- * Colorspace conversion YCbCr -> RGB
- */
-
-.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
-
-.macro STORE_YCC_TO_RGB  scratch0 \
-                         scratch1 \
-                         scratch2 \
-                         outptr
-    sb       \scratch0, \r_offs(\outptr)
-    sb       \scratch1, \g_offs(\outptr)
-    sb       \scratch2, \b_offs(\outptr)
-.if (\pixel_size == 4)
-    li       t0, 0xFF
-    sb       t0, \a_offs(\outptr)
-.endif
-    addiu    \outptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - input_row
- * a3     - output_buf
- * 16(sp) - num_rows
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw         s1, 48(sp)
-    li         t3, 0x8000
-    li         t4, 0x166e9     // FIX(1.40200)
-    li         t5, 0x1c5a2     // FIX(1.77200)
-    li         t6, 0xffff492e  // -FIX(0.71414)
-    li         t7, 0xffffa7e6  // -FIX(0.34414)
-    repl.ph    t8, 128
-
-0:
-    lw         s0, 0(a3)
-    lw         t0, 0(a1)
-    lw         t1, 4(a1)
-    lw         t2, 8(a1)
-    sll        s5, a2, 2
-    addiu      s1, -1
-    lwx        s2, s5(t0)
-    lwx        s3, s5(t1)
-    lwx        s4, s5(t2)
-    addu       t9, s2, a0
-    addiu      a2, 1
-
-1:
-    lbu        s7, 0(s4)       // cr
-    lbu        s6, 0(s3)       // cb
-    lbu        s5, 0(s2)       // y
-    addiu      s2, 1
-    addiu      s4, 1
-    addiu      s7, -128
-    addiu      s6, -128
-    mul        t2, t7, s6
-    mul        t0, t6, s7      // Crgtab[cr]
-    sll        s7, 15
-    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
-    sll        s6, 15
-    addu       t2, t3          // Cbgtab[cb]
-    addu       t2, t0
-
-    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
-    sra        t2, 16
-    addu       t1, s5
-    addu       t2, s5          // add y
-    ins        t2, t1, 16, 16
-    subu.ph    t2, t2, t8
-    addu       t0, s5
-    shll_s.ph  t2, t2, 8
-    subu       t0, 128
-    shra.ph    t2, t2, 8
-    shll_s.w   t0, t0, 24
-    addu.ph    t2, t2, t8      // clip & store
-    sra        t0, t0, 24
-    sra        t1, t2, 16
-    addiu      t0, 128
-
-    STORE_YCC_TO_RGB t1, t2, t0, s0
-
-    bne        s2, t9, 1b
-     addiu     s3, 1
-    bgtz       s1, 0b
-     addiu     a3, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j ra
-     nop
-END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
-
-.purgem STORE_YCC_TO_RGB
-
-.endm
-
-/*------------------------------------------id -- pix R  G  B  A */
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
-GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
-
-/*****************************************************************************/
-/*
- * jsimd_extrgb_gray_convert_mips_dspr2
- * jsimd_extbgr_gray_convert_mips_dspr2
- * jsimd_extrgbx_gray_convert_mips_dspr2
- * jsimd_extbgrx_gray_convert_mips_dspr2
- * jsimd_extxbgr_gray_convert_mips_dspr2
- * jsimd_extxrgb_gray_convert_mips_dspr2
- *
- * Colorspace conversion RGB -> GRAY
- */
-
-.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
-
-.macro DO_RGB_TO_GRAY r,    \
-                      g,    \
-                      b,    \
-                      inptr
-    lbu     \r, \r_offs(\inptr)
-    lbu     \g, \g_offs(\inptr)
-    lbu     \b, \b_offs(\inptr)
-    addiu   \inptr, \pixel_size
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - input_buf
- * a2     - output_buf
- * a3     - output_row
- * 16(sp) - num_rows
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    li      s0, 0x4c8b             // s0 = FIX(0.29900)
-    li      s1, 0x9646             // s1 = FIX(0.58700)
-    li      s2, 0x1d2f             // s2 = FIX(0.11400)
-    li      s7, 0x8000             // s7 = FIX(0.50000)
-    lw      s6, 48(sp)
-    andi    t7, a0, 3
-
-0:
-    addiu   s6, -1                 // s6 = num_rows
-    lw      t0, 0(a1)
-    lw      t1, 0(a2)
-    sll     t3, a3, 2
-    lwx     t1, t3(t1)
-    addiu   a3, 1
-    addu    t9, t1, a0
-    subu    t8, t9, t7
-    beq     t1, t8, 2f
-     nop
-
-1:
-    DO_RGB_TO_GRAY t3, t4, t5, t0
-    DO_RGB_TO_GRAY s3, s4, s5, t0
-
-    mtlo    s7, $ac0
-    maddu   $ac0, s2, t5
-    maddu   $ac0, s1, t4
-    maddu   $ac0, s0, t3
-    mtlo    s7, $ac1
-    maddu   $ac1, s2, s5
-    maddu   $ac1, s1, s4
-    maddu   $ac1, s0, s3
-    extr.w  t6, $ac0, 16
-
-    DO_RGB_TO_GRAY t3, t4, t5, t0
-    DO_RGB_TO_GRAY s3, s4, s5, t0
-
-    mtlo    s7, $ac0
-    maddu   $ac0, s2, t5
-    maddu   $ac0, s1, t4
-    extr.w  t2, $ac1, 16
-    maddu   $ac0, s0, t3
-    mtlo    s7, $ac1
-    maddu   $ac1, s2, s5
-    maddu   $ac1, s1, s4
-    maddu   $ac1, s0, s3
-    extr.w  t5, $ac0, 16
-    sb      t6, 0(t1)
-    sb      t2, 1(t1)
-    extr.w  t3, $ac1, 16
-    addiu   t1, 4
-    sb      t5, -2(t1)
-    sb      t3, -1(t1)
-    bne     t1, t8, 1b
-     nop
-
-2:
-    beqz    t7, 4f
-     nop
-
-3:
-    DO_RGB_TO_GRAY t3, t4, t5, t0
-
-    mtlo    s7, $ac0
-    maddu   $ac0, s2, t5
-    maddu   $ac0, s1, t4
-    maddu   $ac0, s0, t3
-    extr.w  t6, $ac0, 16
-    sb      t6, 0(t1)
-    addiu   t1, 1
-    bne     t1, t9, 3b
-     nop
-
-4:
-    bgtz    s6, 0b
-     addiu  a1, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j ra
-     nop
-END(jsimd_\colorid\()_gray_convert_mips_dspr2)
-
-.purgem DO_RGB_TO_GRAY
-
-.endm
-
-/*------------------------------------------id --  pix R  G  B */
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
-GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
-/*****************************************************************************/
-/*
- * jsimd_h2v2_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v2 upsample routines
- */
-.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
-                                                pixel_size, \
-                                                r1_offs,    \
-                                                g1_offs,    \
-                                                b1_offs,    \
-                                                a1_offs,    \
-                                                r2_offs,    \
-                                                g2_offs,    \
-                                                b2_offs,    \
-                                                a2_offs
-
-.macro STORE_H2V2_2_PIXELS  scratch0 \
-                            scratch1 \
-                            scratch2 \
-                            scratch3 \
-                            scratch4 \
-                            scratch5 \
-                            outptr
-    sb       \scratch0, \r1_offs(\outptr)
-    sb       \scratch1, \g1_offs(\outptr)
-    sb       \scratch2, \b1_offs(\outptr)
-    sb       \scratch3, \r2_offs(\outptr)
-    sb       \scratch4, \g2_offs(\outptr)
-    sb       \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
-    li       \scratch0, 0xFF
-    sb       \scratch0, \a1_offs(\outptr)
-    sb       \scratch0, \a2_offs(\outptr)
-.endif
-    addiu    \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V2_1_PIXEL  scratch0 \
-                           scratch1 \
-                           scratch2 \
-                           outptr
-    sb    \scratch0, \r1_offs(\outptr)
-    sb    \scratch1, \g1_offs(\outptr)
-    sb    \scratch2, \b1_offs(\outptr)
-
-.if (\pixel_size == 8)
-    li    t0, 0xFF
-    sb    t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0     - cinfo->output_width
- * a1     - input_buf
- * a2     - in_row_group_ctr
- * a3     - output_buf
- * 16(sp) - cinfo->sample_range_limit
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    lw           t9, 56(sp)        // cinfo->sample_range_limit
-    lw           v0, 0(a1)
-    lw           v1, 4(a1)
-    lw           t0, 8(a1)
-    sll          t1, a2, 3
-    addiu        t2, t1, 4
-    sll          t3, a2, 2
-    lw           t4, 0(a3)         // t4 = output_buf[0]
-    lwx          t1, t1(v0)        // t1 = input_buf[0][in_row_group_ctr*2]
-    lwx          t2, t2(v0)        // t2 = input_buf[0][in_row_group_ctr*2 + 1]
-    lwx          t5, t3(v1)        // t5 = input_buf[1][in_row_group_ctr]
-    lwx          t6, t3(t0)        // t6 = input_buf[2][in_row_group_ctr]
-    lw           t7, 4(a3)         // t7 = output_buf[1]
-    li           s1, 0xe6ea
-    addiu        t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
-    addiu        s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
-    addiu        s1, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
-    xori         s2, s1, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
-    srl          t3, a0, 1
-    blez         t3, 2f
-     addu        t0, t5, t3        // t0 = end address
- 1:
-    lbu          t3, 0(t5)
-    lbu          s3, 0(t6)
-    addiu        t5, t5, 1
-    addiu        t3, t3, -128      // (cb - 128)
-    addiu        s3, s3, -128      // (cr - 128)
-    mult         $ac1, s1, t3
-    madd         $ac1, s2, s3
-    sll          s3, s3, 15
-    sll          t3, t3, 15
-    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
-    extr_r.w     s5, $ac1, 16
-    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
-    lbu          v0, 0(t1)
-    addiu        t6, t6, 1
-    addiu        t1, t1, 2
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          AT, 0(t3)
-    lbu          s7, 0(s3)
-    lbu          ra, 0(v1)
-    lbu          v0, -1(t1)
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-    lbu          v0, 0(t2)
-
-    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
-
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          AT, 0(t3)
-    lbu          s7, 0(s3)
-    lbu          ra, 0(v1)
-    lbu          v0, 1(t2)
-    addiu        t2, t2, 2
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-
-    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
-
-    bne          t0, t5, 1b
-     nop
-2:
-    andi         t0, a0, 1
-    beqz         t0, 4f
-     lbu          t3, 0(t5)
-    lbu          s3, 0(t6)
-    addiu        t3, t3, -128      // (cb - 128)
-    addiu        s3, s3, -128      // (cr - 128)
-    mult         $ac1, s1, t3
-    madd         $ac1, s2, s3
-    sll          s3, s3, 15
-    sll          t3, t3, 15
-    lbu          v0, 0(t1)
-    extr_r.w     s5, $ac1, 16
-    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
-    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-    lbu          v0, 0(t2)
-
-    STORE_H2V2_1_PIXEL t3, s3, v1, t4
-
-    addu         t3, v0, s4        // y+cred
-    addu         s3, v0, s5        // y+cgreen
-    addu         v1, v0, s6        // y+cblue
-    addu         t3, t9, t3        // y+cred
-    addu         s3, t9, s3        // y+cgreen
-    addu         v1, t9, v1        // y+cblue
-    lbu          t3, 0(t3)
-    lbu          s3, 0(s3)
-    lbu          v1, 0(v1)
-
-    STORE_H2V2_1_PIXEL t3, s3, v1, t7
-4:
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    j           ra
-     nop
-
-END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V2_1_PIXEL
-.purgem STORE_H2V2_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
-/*****************************************************************************/
-/*
- * jsimd_h2v1_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
- * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
- * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
- *
- * Merged h2v1 upsample routines
- */
-
-.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
-                                                pixel_size, \
-                                                r1_offs,    \
-                                                g1_offs,    \
-                                                b1_offs,    \
-                                                a1_offs,    \
-                                                r2_offs,    \
-                                                g2_offs,    \
-                                                b2_offs,    \
-                                                a2_offs
-
-.macro STORE_H2V1_2_PIXELS  scratch0 \
-                            scratch1 \
-                            scratch2 \
-                            scratch3 \
-                            scratch4 \
-                            scratch5 \
-                            outptr
-    sb       \scratch0, \r1_offs(\outptr)
-    sb       \scratch1, \g1_offs(\outptr)
-    sb       \scratch2, \b1_offs(\outptr)
-    sb       \scratch3, \r2_offs(\outptr)
-    sb       \scratch4, \g2_offs(\outptr)
-    sb       \scratch5, \b2_offs(\outptr)
-.if (\pixel_size == 8)
-    li       t0, 0xFF
-    sb       t0, \a1_offs(\outptr)
-    sb       t0, \a2_offs(\outptr)
-.endif
-    addiu    \outptr, \pixel_size
-.endm
-
-.macro STORE_H2V1_1_PIXEL  scratch0 \
-                           scratch1 \
-                           scratch2 \
-                           outptr
-    sb    \scratch0, \r1_offs(\outptr)
-    sb    \scratch1, \g1_offs(\outptr)
-    sb    \scratch2, \b1_offs(\outptr)
-.if (\pixel_size == 8)
-    li    t0, 0xFF
-    sb    t0, \a1_offs(\outptr)
-.endif
-.endm
-
-LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-/*
- * a0     - cinfo->output_width
- * a1     - input_buf
- * a2     - in_row_group_ctr
- * a3     - output_buf
- * 16(sp) - range_limit
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    li           t0, 0xe6ea
-    lw           t1, 0(a1)         // t1 = input_buf[0]
-    lw           t2, 4(a1)         // t2 = input_buf[1]
-    lw           t3, 8(a1)         // t3 = input_buf[2]
-    lw           t8, 56(sp)        // t8 = range_limit
-    addiu        s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
-    addiu        s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
-    addiu        s0, t0, 0x9916    // s0 = 0x8000
-    addiu        s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
-    xori         s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
-    srl          t0, a0, 1
-    sll          t4, a2, 2
-    lwx          s5, t4(t1)        // s5 = inptr0
-    lwx          s6, t4(t2)        // s6 = inptr1
-    lwx          s7, t4(t3)        // s7 = inptr2
-    lw           t7, 0(a3)         // t7 = outptr
-    blez         t0, 2f
-     addu        t9, s6, t0        // t9 = end address
-1:
-    lbu          t2, 0(s6)         // t2 = cb
-    lbu          t0, 0(s7)         // t0 = cr
-    lbu          t1, 0(s5)         // t1 = y
-    addiu        t2, t2, -128      // t2 = cb - 128
-    addiu        t0, t0, -128      // t0 = cr - 128
-    mult         $ac1, s4, t2
-    madd         $ac1, s3, t0
-    sll          t0, t0, 15
-    sll          t2, t2, 15
-    mulq_rs.w    t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
-    extr_r.w     t5, $ac1, 16
-    mulq_rs.w    t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
-    addiu        s7, s7, 1
-    addiu        s6, s6, 1
-    addu         t2, t1, t0        // t2 = y + cred
-    addu         t3, t1, t5        // t3 = y + cgreen
-    addu         t4, t1, t6        // t4 = y + cblue
-    addu         t2, t8, t2
-    addu         t3, t8, t3
-    addu         t4, t8, t4
-    lbu          t1, 1(s5)
-    lbu          v0, 0(t2)
-    lbu          v1, 0(t3)
-    lbu          ra, 0(t4)
-    addu         t2, t1, t0
-    addu         t3, t1, t5
-    addu         t4, t1, t6
-    addu         t2, t8, t2
-    addu         t3, t8, t3
-    addu         t4, t8, t4
-    lbu          t2, 0(t2)
-    lbu          t3, 0(t3)
-    lbu          t4, 0(t4)
-
-    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
-
-    bne          t9, s6, 1b
-     addiu       s5, s5, 2
-2:
-    andi         t0, a0, 1
-    beqz         t0, 4f
-     nop
-3:
-    lbu          t2, 0(s6)
-    lbu          t0, 0(s7)
-    lbu          t1, 0(s5)
-    addiu        t2, t2, -128      //(cb - 128)
-    addiu        t0, t0, -128      //(cr - 128)
-    mul          t3, s4, t2
-    mul          t4, s3, t0
-    sll          t0, t0, 15
-    sll          t2, t2, 15
-    mulq_rs.w    t0, s1, t0       // (C1*cr + ONE_HALF)>> SCALEBITS
-    mulq_rs.w    t6, s2, t2       // (C2*cb + ONE_HALF)>> SCALEBITS
-    addu         t3, t3, s0
-    addu         t3, t4, t3
-    sra          t5, t3, 16       // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
-    addu         t2, t1, t0       // y + cred
-    addu         t3, t1, t5       // y + cgreen
-    addu         t4, t1, t6       // y + cblue
-    addu         t2, t8, t2
-    addu         t3, t8, t3
-    addu         t4, t8, t4
-    lbu          t2, 0(t2)
-    lbu          t3, 0(t3)
-    lbu          t4, 0(t4)
-
-    STORE_H2V1_1_PIXEL t2, t3, t4, t7
-4:
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
-
-    j            ra
-     nop
-
-END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
-
-.purgem STORE_H2V1_1_PIXEL
-.purgem STORE_H2V1_2_PIXELS
-.endm
-
-/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
-GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
-/*****************************************************************************/
-/*
- * jsimd_h2v2_fancy_upsample_mips_dspr2
- *
- * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
- */
-LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - downsampled_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-
-    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
-    li             s4, 0
-    lw             s2, 0(a3)       // s2 = *output_data_ptr
-0:
-    li             t9, 2
-    lw             s1, -4(a2)      // s1 = inptr1
-
-1:
-    lw             s0, 0(a2)       // s0 = inptr0
-    lwx            s3, s4(s2)
-    addiu          s5, a1, -2      // s5 = downsampled_width - 2
-    srl            t4, s5, 1
-    sll            t4, t4, 1
-    lbu            t0, 0(s0)
-    lbu            t1, 1(s0)
-    lbu            t2, 0(s1)
-    lbu            t3, 1(s1)
-    addiu          s0, 2
-    addiu          s1, 2
-    addu           t8, s0, t4      // t8 = end address
-    andi           s5, s5, 1       // s5 = residual
-    sll            t4, t0, 1
-    sll            t6, t1, 1
-    addu           t0, t0, t4      // t0 = (*inptr0++) * 3
-    addu           t1, t1, t6      // t1 = (*inptr0++) * 3
-    addu           t7, t0, t2      // t7 = thiscolsum
-    addu           t6, t1, t3      // t5 = nextcolsum
-    sll            t0, t7, 2       // t0 = thiscolsum * 4
-    subu           t1, t0, t7      // t1 = thiscolsum * 3
-    shra_r.w       t0, t0, 4
-    addiu          t1, 7
-    addu           t1, t1, t6
-    srl            t1, t1, 4
-    sb             t0, 0(s3)
-    sb             t1, 1(s3)
-    beq            t8, s0, 22f     // skip to final iteration if width == 3
-     addiu          s3, 2
-2:
-    lh             t0, 0(s0)       // t0 = A3|A2
-    lh             t2, 0(s1)       // t2 = B3|B2
-    addiu          s0, 2
-    addiu          s1, 2
-    preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
-    preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
-    shll.ph        t1, t0, 1
-    sll            t3, t6, 1
-    addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
-    addu           t3, t3, t6      // t3 = this * 3
-    addu.ph        t0, t0, t2      // t0 = next2|next1
-    addu           t1, t3, t7
-    andi           t7, t0, 0xFFFF  // t7 = next1
-    sll            t2, t7, 1
-    addu           t2, t7, t2      // t2 = next1*3
-    addu           t4, t2, t6
-    srl            t6, t0, 16      // t6 = next2
-    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
-    addu           t0, t3, t7
-    addiu          t0, 7
-    srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
-    shra_r.w       t4, t4, 4       // t3 = (next1*3 + this + 8) >> 4
-    addu           t2, t2, t6
-    addiu          t2, 7
-    srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
-    sb             t1, 0(s3)
-    sb             t0, 1(s3)
-    sb             t4, 2(s3)
-    sb             t2, 3(s3)
-    bne            t8, s0, 2b
-     addiu         s3, 4
-22:
-    beqz           s5, 4f
-     addu          t8, s0, s5
-3:
-    lbu            t0, 0(s0)
-    lbu            t2, 0(s1)
-    addiu          s0, 1
-    addiu          s1, 1
-    sll            t3, t6, 1
-    sll            t1, t0, 1
-    addu           t1, t0, t1      // t1 = inptr0 * 3
-    addu           t3, t3, t6      // t3 = thiscolsum * 3
-    addu           t5, t1, t2
-    addu           t1, t3, t7
-    shra_r.w       t1, t1, 4
-    addu           t0, t3, t5
-    addiu          t0, 7
-    srl            t0, t0, 4
-    sb             t1, 0(s3)
-    sb             t0, 1(s3)
-    addiu          s3, 2
-    move           t7, t6
-    bne            t8, s0, 3b
-     move          t6, t5
-4:
-    sll            t0, t6, 2       // t0 = thiscolsum * 4
-    subu           t1, t0, t6      // t1 = thiscolsum * 3
-    addu           t1, t1, t7
-    addiu          s4, 4
-    shra_r.w       t1, t1, 4
-    addiu          t0, 7
-    srl            t0, t0, 4
-    sb             t1, 0(s3)
-    sb             t0, 1(s3)
-    addiu          t9, -1
-    addiu          s3, 2
-    bnez           t9, 1b
-     lw            s1, 4(a2)
-    srl            t0, s4, 2
-    subu           t0, a0, t0
-    bgtz           t0, 0b
-     addiu         a2, 4
-
-    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
-    j ra
-     nop
-END(jsimd_h2v2_fancy_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - downsampled_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    .set at
-
-    beqz           a0, 3f
-     sll           t0, a0, 2
-    lw             s1, 0(a3)
-    li             s3, 0x10001
-    addu           s0, s1, t0
-0:
-    addiu          t8, a1, -2
-    srl            t9, t8, 2
-    lw             t7, 0(a2)
-    lw             s2, 0(s1)
-    lbu            t0, 0(t7)
-    lbu            t1, 1(t7)   // t1 = inptr[1]
-    sll            t2, t0, 1
-    addu           t2, t2, t0  // t2 = invalue*3
-    addu           t2, t2, t1
-    shra_r.w       t2, t2, 2
-    sb             t0, 0(s2)
-    sb             t2, 1(s2)
-    beqz           t9, 11f
-     addiu         s2, 2
-1:
-    ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
-    ulw            t1, 1(t7)
-    ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
-    preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
-    preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
-    preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
-    preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
-    preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
-    shll.ph        t5, t4, 1
-    shll.ph        t6, t1, 1
-    addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
-    addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
-    addu.ph        t4, t3, s3
-    addu.ph        t0, t0, s3
-    addu.ph        t4, t4, t5
-    addu.ph        t0, t0, t6
-    shrl.ph        t4, t4, 2   // t4 = |0|P3|0|P2|
-    shrl.ph        t0, t0, 2   // t0 = |0|P1|0|P0|
-    addu.ph        t2, t2, t5
-    addu.ph        t3, t3, t6
-    shra_r.ph      t2, t2, 2   // t2 = |0|P5|0|P4|
-    shra_r.ph      t3, t3, 2   // t3 = |0|P3|0|P2|
-    shll.ph        t2, t2, 8
-    shll.ph        t3, t3, 8
-    or             t2, t4, t2
-    or             t3, t3, t0
-    addiu          t9, -1
-    usw            t3, 0(s2)
-    usw            t2, 4(s2)
-    addiu          s2, 8
-    bgtz           t9, 1b
-     addiu         t7, 4
-11:
-    andi           t8, 3
-    beqz           t8, 22f
-     addiu         t7, 1
-
-2:
-    lbu            t0, 0(t7)
-    addiu          t7, 1
-    sll            t1, t0, 1
-    addu           t2, t0, t1  // t2 = invalue
-    lbu            t3, -2(t7)
-    lbu            t4, 0(t7)
-    addiu          t3, 1
-    addiu          t4, 2
-    addu           t3, t3, t2
-    addu           t4, t4, t2
-    srl            t3, 2
-    srl            t4, 2
-    sb             t3, 0(s2)
-    sb             t4, 1(s2)
-    addiu          t8, -1
-    bgtz           t8, 2b
-     addiu         s2, 2
-
-22:
-    lbu            t0, 0(t7)
-    lbu            t2, -1(t7)
-    sll            t1, t0, 1
-    addu           t1, t1, t0 // t1 = invalue * 3
-    addu           t1, t1, t2
-    addiu          t1, 1
-    srl            t1, t1, 2
-    sb             t1, 0(s2)
-    sb             t0, 1(s2)
-    addiu          s1, 4
-    bne            s1, s0, 0b
-     addiu         a2, 4
-3:
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    j              ra
-     nop
-END(jsimd_h2v1_fancy_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
-/*
- * a0     - cinfo->image_width
- * a1     - cinfo->max_v_samp_factor
- * a2     - compptr->v_samp_factor
- * a3     - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
-
-    beqz        a2, 7f
-     lw         s1, 44(sp)  // s1 = output_data
-    lw          s0, 40(sp)  // s0 = input_data
-    srl         s2, a0, 2
-    andi        t9, a0, 2
-    srl         t7, t9, 1
-    addu        s2, t7, s2
-    sll         t0, a3, 3   // t0 = width_in_blocks*DCT
-    srl         t7, t0, 1
-    subu        s2, t7, s2
-0:
-    andi        t6, a0, 1   // t6 = temp_index
-    addiu       t6, -1
-    lw          t4, 0(s1)   // t4 = outptr
-    lw          t5, 0(s0)   // t5 = inptr0
-    li          s3, 0       // s3 = bias
-    srl         t7, a0, 1   // t7 = image_width1
-    srl         s4, t7, 2
-    andi        t8, t7, 3
-1:
-    ulhu        t0, 0(t5)
-    ulhu        t1, 2(t5)
-    ulhu        t2, 4(t5)
-    ulhu        t3, 6(t5)
-    raddu.w.qb  t0, t0
-    raddu.w.qb  t1, t1
-    raddu.w.qb  t2, t2
-    raddu.w.qb  t3, t3
-    shra.ph     t0, t0, 1
-    shra_r.ph   t1, t1, 1
-    shra.ph     t2, t2, 1
-    shra_r.ph   t3, t3, 1
-    sb          t0, 0(t4)
-    sb          t1, 1(t4)
-    sb          t2, 2(t4)
-    sb          t3, 3(t4)
-    addiu       s4, -1
-    addiu       t4, 4
-    bgtz        s4, 1b
-     addiu      t5, 8
-    beqz        t8, 3f
-     addu       s4, t4, t8
-2:
-    ulhu        t0, 0(t5)
-    raddu.w.qb  t0, t0
-    addqh.w     t0, t0, s3
-    xori        s3, s3, 1
-    sb          t0, 0(t4)
-    addiu       t4, 1
-    bne         t4, s4, 2b
-     addiu      t5, 2
-3:
-    lbux        t1, t6(t5)
-    sll         t1, 1
-    addqh.w     t2, t1, s3  // t2 = pixval1
-    xori        s3, s3, 1
-    addqh.w     t3, t1, s3  // t3 = pixval2
-    blez        s2, 5f
-     append     t3, t2,  8
-    addu        t5, t4, s2  // t5 = loop_end2
-4:
-    ush         t3, 0(t4)
-    addiu       s2, -1
-    bgtz        s2, 4b
-     addiu      t4,  2
-5:
-    beqz        t9, 6f
-     nop
-    sb          t2, 0(t4)
-6:
-    addiu       s1, 4
-    addiu       a2, -1
-    bnez        a2, 0b
-     addiu      s0, 4
-7:
-    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
-
-    j           ra
-    nop
-END(jsimd_h2v1_downsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
-
-/*
- * a0     - cinfo->image_width
- * a1     - cinfo->max_v_samp_factor
- * a2     - compptr->v_samp_factor
- * a3     - compptr->width_in_blocks
- * 16(sp) - input_data
- * 20(sp) - output_data
- */
-    .set at
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    beqz         a2, 8f
-     lw          s1, 52(sp)      // s1 = output_data
-    lw           s0, 48(sp)      // s0 = input_data
-
-    andi         t6, a0, 1       // t6 = temp_index
-    addiu        t6, -1
-    srl          t7, a0, 1       // t7 = image_width1
-    srl          s4, t7, 2
-    andi         t8, t7, 3
-    andi         t9, a0, 2
-    srl          s2, a0, 2
-    srl          t7, t9, 1
-    addu         s2, t7, s2
-    sll          t0, a3, 3       // s2 = width_in_blocks*DCT
-    srl          t7, t0, 1
-    subu         s2, t7, s2
-0:
-    lw           t4, 0(s1)       // t4 = outptr
-    lw           t5, 0(s0)       // t5 = inptr0
-    lw           s7, 4(s0)       // s7 = inptr1
-    li           s6, 1           // s6 = bias
-2:
-    ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
-    ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
-    ulw          t2, 4(t5)
-    ulw          t3, 4(s7)
-    precrq.ph.w  t7, t0, t1      // t2 = |P3|P2|Q3|Q2|
-    ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
-    raddu.w.qb   t1, t7
-    raddu.w.qb   t0, t0
-    shra_r.w     t1, t1, 2
-    addiu        t0, 1
-    srl          t0, 2
-    precrq.ph.w  t7, t2, t3
-    ins          t2, t3, 16, 16
-    raddu.w.qb   t7, t7
-    raddu.w.qb   t2, t2
-    shra_r.w     t7, t7, 2
-    addiu        t2, 1
-    srl          t2, 2
-    sb           t0, 0(t4)
-    sb           t1, 1(t4)
-    sb           t2, 2(t4)
-    sb           t7, 3(t4)
-    addiu        t4, 4
-    addiu        t5, 8
-    addiu        s4, s4, -1
-    bgtz         s4, 2b
-     addiu       s7, 8
-    beqz         t8, 4f
-     addu        t8, t4, t8
-3:
-    ulhu         t0, 0(t5)
-    ulhu         t1, 0(s7)
-    ins          t0, t1, 16, 16
-    raddu.w.qb   t0, t0
-    addu         t0, t0, s6
-    srl          t0, 2
-    xori         s6, s6, 3
-    sb           t0, 0(t4)
-    addiu        t5, 2
-    addiu        t4, 1
-    bne          t8, t4, 3b
-     addiu       s7, 2
-4:
-    lbux         t1, t6(t5)
-    sll          t1, 1
-    lbux         t0, t6(s7)
-    sll          t0, 1
-    addu         t1, t1, t0
-    addu         t3, t1, s6
-    srl          t0, t3, 2       // t2 = pixval1
-    xori         s6, s6, 3
-    addu         t2, t1, s6
-    srl          t1, t2, 2       // t3 = pixval2
-    blez         s2, 6f
-     append      t1, t0, 8
-5:
-    ush          t1, 0(t4)
-    addiu        s2, -1
-    bgtz         s2, 5b
-     addiu       t4, 2
-6:
-    beqz         t9, 7f
-     nop
-    sb           t0, 0(t4)
-7:
-    addiu        s1, 4
-    addiu        a2, -1
-    bnez         a2, 0b
-     addiu       s0, 8
-8:
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j            ra
-     nop
-END(jsimd_h2v2_downsample_mips_dspr2)
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
-/*
- * a0     - input_data
- * a1     - output_data
- * a2     - compptr->v_samp_factor
- * a3     - cinfo->max_v_samp_factor
- * 16(sp) - cinfo->smoothing_factor
- * 20(sp) - compptr->width_in_blocks
- * 24(sp) - cinfo->image_width
- */
-
-    .set at
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw          s7, 52(sp)      // compptr->width_in_blocks
-    lw          s0, 56(sp)      // cinfo->image_width
-    lw          s6, 48(sp)      // cinfo->smoothing_factor
-    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
-    sll         v0, s7, 1
-    subu        v0, v0, s0
-    blez        v0, 2f
-    move        v1, zero
-    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
-0:
-    addiu       t1, a0, -4
-    sll         t2, v1, 2
-    lwx         t1, t2(t1)
-    move        t3, v0
-    addu        t1, t1, s0
-    lbu         t2, -1(t1)
-1:
-    addiu       t3, t3, -1
-    sb          t2, 0(t1)
-    bgtz        t3, 1b
-    addiu       t1, t1, 1
-    addiu       v1, v1, 1
-    bne         v1, t0, 0b
-    nop
-2:
-    li          v0, 80
-    mul         v0, s6, v0
-    li          v1, 16384
-    move        t4, zero
-    move        t5, zero
-    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
-    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
-3:
-/* Special case for first column: pretend column -1 is same as column 0 */
-    sll         v0, t4, 2
-    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
-    sll         v1, t5, 2
-    addiu       t9, v1, 4
-    addiu       s0, v1, -4
-    addiu       s1, v1, 8
-    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
-    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
-    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
-    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, 0(s2)
-    lbu         v1, 2(s2)
-    lbu         t0, 0(t9)
-    lbu         t1, 2(t9)
-    addu        v0, v0, v1
-    mult        $ac1,t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 2(s0)
-    addu        t0, t0, v0
-    lbu         t3, 2(s1)
-    addu        s3, t0, s3
-    lbu         v0, 0(s0)
-    lbu         t0, 0(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    addu        t0, t0, v0
-    addu        s3, t0, s3
-    madd        $ac1,s3, t7
-    extr_r.w    v0, $ac1, 16
-    addiu       t8, t8, 1
-    addiu       s2, s2, 2
-    addiu       t9, t9, 2
-    addiu       s0, s0, 2
-    addiu       s1, s1, 2
-    sb          v0, -1(t8)
-    addiu       s4, s7, -2
-    and         s4, s4, 3
-    addu        s5, s4, t8      //end adress
-4:
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, -1(s2)
-    lbu         v1, 2(s2)
-    lbu         t0, -1(t9)
-    lbu         t1, 2(t9)
-    addu        v0, v0, v1
-    mult        $ac1, t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 2(s0)
-    addu        t0, t0, v0
-    lbu         t3, 2(s1)
-    addu        s3, t0, s3
-    lbu         v0, -1(s0)
-    lbu         t0, -1(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    addu        t0, t0, v0
-    addu        s3, t0, s3
-    madd        $ac1, s3, t7
-    extr_r.w    t2, $ac1, 16
-    addiu       t8, t8, 1
-    addiu       s2, s2, 2
-    addiu       t9, t9, 2
-    addiu       s0, s0, 2
-    sb          t2, -1(t8)
-    bne         s5, t8, 4b
-    addiu       s1, s1, 2
-    addiu       s5, s7, -2
-    subu        s5, s5, s4
-    addu        s5, s5, t8      //end adress
-5:
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, -1(s2)
-    lbu         v1, 2(s2)
-    lbu         t0, -1(t9)
-    lbu         t1, 2(t9)
-    addu        v0, v0, v1
-    mult        $ac1, t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 2(s0)
-    addu        t0, t0, v0
-    lbu         t3, 2(s1)
-    addu        s3, t0, s3
-    lbu         v0, -1(s0)
-    lbu         t0, -1(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    lh          v1, 2(t9)
-    addu        t0, t0, v0
-    lh          v0, 2(s2)
-    addu        s3, t0, s3
-    lh          t0, 2(s0)
-    lh          t1, 2(s1)
-    madd        $ac1, s3, t7
-    extr_r.w    t2, $ac1, 16
-    ins         t0, t1, 16, 16
-    ins         v0, v1, 16, 16
-    raddu.w.qb  s3, t0
-    lbu         v1, 4(s2)
-    lbu         t0, 1(t9)
-    lbu         t1, 4(t9)
-    sb          t2, 0(t8)
-    raddu.w.qb  t3, v0
-    lbu         v0, 1(s2)
-    addu        t0, t0, t1
-    mult        $ac1, t3, t6
-    addu        v0, v0, v1
-    lbu         t2, 4(s0)
-    addu        t0, t0, v0
-    lbu         v0, 1(s0)
-    addu        s3, t0, s3
-    lbu         t0, 1(s1)
-    lbu         t3, 4(s1)
-    addu        v0, v0, t2
-    sll         s3, s3, 1
-    addu        t0, t0, t3
-    lh          v1, 4(t9)
-    addu        t0, t0, v0
-    lh          v0, 4(s2)
-    addu        s3, t0, s3
-    lh          t0, 4(s0)
-    lh          t1, 4(s1)
-    madd        $ac1, s3, t7
-    extr_r.w    t2, $ac1, 16
-    ins         t0, t1, 16, 16
-    ins         v0, v1, 16, 16
-    raddu.w.qb  s3, t0
-    lbu         v1, 6(s2)
-    lbu         t0, 3(t9)
-    lbu         t1, 6(t9)
-    sb          t2, 1(t8)
-    raddu.w.qb  t3, v0
-    lbu         v0, 3(s2)
-    addu        t0, t0,t1
-    mult        $ac1, t3, t6
-    addu        v0, v0, v1
-    lbu         t2, 6(s0)
-    addu        t0, t0, v0
-    lbu         v0, 3(s0)
-    addu        s3, t0, s3
-    lbu         t0, 3(s1)
-    lbu         t3, 6(s1)
-    addu        v0, v0, t2
-    sll         s3, s3, 1
-    addu        t0, t0, t3
-    lh          v1, 6(t9)
-    addu        t0, t0, v0
-    lh          v0, 6(s2)
-    addu        s3, t0, s3
-    lh          t0, 6(s0)
-    lh          t1, 6(s1)
-    madd        $ac1, s3, t7
-    extr_r.w    t3, $ac1, 16
-    ins         t0, t1, 16, 16
-    ins         v0, v1, 16, 16
-    raddu.w.qb  s3, t0
-    lbu         v1, 8(s2)
-    lbu         t0, 5(t9)
-    lbu         t1, 8(t9)
-    sb          t3, 2(t8)
-    raddu.w.qb  t2, v0
-    lbu         v0, 5(s2)
-    addu        t0, t0, t1
-    mult        $ac1, t2, t6
-    addu        v0, v0, v1
-    lbu         t2, 8(s0)
-    addu        t0, t0, v0
-    lbu         v0, 5(s0)
-    addu        s3, t0, s3
-    lbu         t0, 5(s1)
-    lbu         t3, 8(s1)
-    addu        v0, v0, t2
-    sll         s3, s3, 1
-    addu        t0, t0, t3
-    addiu       t8, t8, 4
-    addu        t0, t0, v0
-    addiu       s2, s2, 8
-    addu        s3, t0, s3
-    addiu       t9, t9, 8
-    madd        $ac1, s3, t7
-    extr_r.w    t1, $ac1, 16
-    addiu       s0, s0, 8
-    addiu       s1, s1, 8
-    bne         s5, t8, 5b
-    sb          t1, -1(t8)
-/* Special case for last column */
-    lh          v0, 0(s2)
-    lh          v1, 0(t9)
-    lh          t0, 0(s0)
-    lh          t1, 0(s1)
-    ins         v0, v1, 16, 16
-    ins         t0, t1, 16, 16
-    raddu.w.qb  t2, v0
-    raddu.w.qb  s3, t0
-    lbu         v0, -1(s2)
-    lbu         v1, 1(s2)
-    lbu         t0, -1(t9)
-    lbu         t1, 1(t9)
-    addu        v0, v0, v1
-    mult        $ac1, t2, t6
-    addu        t0, t0, t1
-    lbu         t2, 1(s0)
-    addu        t0, t0, v0
-    lbu         t3, 1(s1)
-    addu        s3, t0, s3
-    lbu         v0, -1(s0)
-    lbu         t0, -1(s1)
-    sll         s3, s3, 1
-    addu        v0, v0, t2
-    addu        t0, t0, t3
-    addu        t0, t0, v0
-    addu        s3, t0, s3
-    madd        $ac1, s3, t7
-    extr_r.w    t0, $ac1, 16
-    addiu       t5, t5, 2
-    sb          t0, 0(t8)
-    addiu       t4, t4, 1
-    bne         t4, a2, 3b
-    addiu       t5, t5, 2
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j           ra
-     nop
-
-END(jsimd_h2v2_smooth_downsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
-/*
- * a0     - upsample->h_expand[compptr->component_index]
- * a1     - upsample->v_expand[compptr->component_index]
- * a2     - input_data
- * a3     - output_data_ptr
- * 16(sp) - cinfo->output_width
- * 20(sp) - cinfo->max_v_samp_factor
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    lw      s0, 0(a3)    // s0 = output_data
-    lw      s1, 32(sp)   // s1 = cinfo->output_width
-    lw      s2, 36(sp)   // s2 = cinfo->max_v_samp_factor
-    li      t6, 0        // t6 = inrow
-    beqz    s2, 10f
-     li     s3, 0        // s3 = outrow
-0:
-    addu    t0, a2, t6
-    addu    t7, s0, s3
-    lw      t3, 0(t0)    // t3 = inptr
-    lw      t8, 0(t7)    // t8 = outptr
-    beqz    s1, 4f
-     addu   t5, t8, s1   // t5 = outend
-1:
-    lb      t2, 0(t3)    // t2 = invalue = *inptr++
-    addiu   t3, 1
-    beqz    a0, 3f
-     move   t0, a0       // t0 = h_expand
-2:
-    sb      t2, 0(t8)
-    addiu   t0, -1
-    bgtz    t0, 2b
-     addiu  t8, 1
-3:
-    bgt     t5, t8, 1b
-     nop
-4:
-    addiu   t9, a1, -1   // t9 = v_expand - 1
-    blez    t9, 9f
-     nop
-5:
-    lw      t3, 0(s0)
-    lw      t4, 4(s0)
-    subu    t0, s1, 0xF
-    blez    t0, 7f
-     addu   t5, t3, s1   // t5 = end address
-    andi    t7, s1, 0xF  // t7 = residual
-    subu    t8, t5, t7
-6:
-    ulw     t0, 0(t3)
-    ulw     t1, 4(t3)
-    ulw     t2, 8(t3)
-    usw     t0, 0(t4)
-    ulw     t0, 12(t3)
-    usw     t1, 4(t4)
-    usw     t2, 8(t4)
-    usw     t0, 12(t4)
-    addiu   t3, 16
-    bne     t3, t8, 6b
-     addiu  t4, 16
-    beqz    t7, 8f
-     nop
-7:
-    lbu     t0, 0(t3)
-    sb      t0, 0(t4)
-    addiu   t3, 1
-    bne     t3, t5, 7b
-     addiu  t4, 1
-8:
-    addiu   t9, -1
-    bgtz    t9, 5b
-     addiu  s0, 8
-9:
-    addu    s3, s3, a1
-    bne     s3, s2, 0b
-     addiu  t6, 1
-10:
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    j       ra
-     nop
-END(jsimd_int_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - cinfo->output_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-    lw      t7, 0(a3)       // t7 = output_data
-    andi    t8, a1, 0xf     // t8 = residual
-    sll     t0, a0, 2
-    blez    a0, 4f
-     addu   t9, t7, t0      // t9 = output_data end address
-0:
-    lw      t5, 0(t7)       // t5 = outptr
-    lw      t6, 0(a2)       // t6 = inptr
-    addu    t3, t5, a1      // t3 = outptr + output_width (end address)
-    subu    t3, t8          // t3 = end address - residual
-    beq     t5, t3, 2f
-     move   t4, t8
-1:
-    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
-    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
-    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
-    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
-    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
-    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
-    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
-    usw     t0, 0(t5)
-    usw     t1, 4(t5)
-    srl     t0, t2, 16      // t0 = |X|X|P7|P6|
-    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
-    ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
-    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
-    ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
-    usw     t2, 8(t5)
-    usw     t0, 12(t5)
-    addiu   t5, 16
-    bne     t5, t3, 1b
-     addiu  t6, 8
-    beqz    t8, 3f
-     move   t4, t8
-2:
-    lbu     t1, 0(t6)
-    sb      t1, 0(t5)
-    sb      t1, 1(t5)
-    addiu   t4, -2
-    addiu   t6, 1
-    bgtz    t4, 2b
-     addiu  t5, 2
-3:
-    addiu   t7, 4
-    bne     t9, t7, 0b
-     addiu  a2, 4
-4:
-    j       ra
-     nop
-END(jsimd_h2v1_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
-/*
- * a0     - cinfo->max_v_samp_factor
- * a1     - cinfo->output_width
- * a2     - input_data
- * a3     - output_data_ptr
- */
-    lw      t7, 0(a3)
-    blez    a0, 7f
-     andi   t9, a1, 0xf     // t9 = residual
-0:
-    lw      t6, 0(a2)       // t6 = inptr
-    lw      t5, 0(t7)       // t5 = outptr
-    addu    t8, t5, a1      // t8 = outptr end address
-    subu    t8, t9          // t8 = end address - residual
-    beq     t5, t8, 2f
-     move   t4, t9
-1:
-    ulw     t0, 0(t6)
-    srl     t1, t0, 16
-    ins     t0, t0, 16, 16
-    ins     t0, t0, 8, 16
-    ins     t1, t1, 16, 16
-    ins     t1, t1, 8, 16
-    ulw     t2, 4(t6)
-    usw     t0, 0(t5)
-    usw     t1, 4(t5)
-    srl     t3, t2, 16
-    ins     t2, t2, 16, 16
-    ins     t2, t2, 8, 16
-    ins     t3, t3, 16, 16
-    ins     t3, t3, 8, 16
-    usw     t2, 8(t5)
-    usw     t3, 12(t5)
-    addiu   t5, 16
-    bne     t5, t8, 1b
-     addiu  t6, 8
-    beqz    t9, 3f
-     move   t4, t9
-2:
-    lbu     t0, 0(t6)
-    sb      t0, 0(t5)
-    sb      t0, 1(t5)
-    addiu   t4, -2
-    addiu   t6, 1
-    bgtz    t4, 2b
-     addiu  t5, 2
-3:
-    lw      t6, 0(t7)       // t6 = outptr[0]
-    lw      t5, 4(t7)       // t5 = outptr[1]
-    addu    t4, t6, a1      // t4 = new end address
-    beq     a1, t9, 5f
-     subu   t8, t4, t9
-4:
-    ulw     t0, 0(t6)
-    ulw     t1, 4(t6)
-    ulw     t2, 8(t6)
-    usw     t0, 0(t5)
-    ulw     t0, 12(t6)
-    usw     t1, 4(t5)
-    usw     t2, 8(t5)
-    usw     t0, 12(t5)
-    addiu   t6, 16
-    bne     t6, t8, 4b
-     addiu  t5, 16
-    beqz    t9, 6f
-     nop
-5:
-    lbu     t0, 0(t6)
-    sb      t0, 0(t5)
-    addiu   t6, 1
-    bne     t6, t4, 5b
-     addiu  t5, 1
-6:
-    addiu   t7, 8
-    addiu   a0, -2
-    bgtz    a0, 0b
-     addiu  a2, 4
-7:
-    j       ra
-     nop
-END(jsimd_h2v2_upsample_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
-/*
- * a0     - coef_block
- * a1     - compptr->dcttable
- * a2     - output
- * a3     - range_limit
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    addiu     sp, sp, -256
-    move      v0, sp
-    addiu     v1, zero, 8      // v1 = DCTSIZE = 8
-1:
-    lh        s4, 32(a0)       // s4 = inptr[16]
-    lh        s5, 64(a0)       // s5 = inptr[32]
-    lh        s6, 96(a0)       // s6 = inptr[48]
-    lh        t1, 112(a0)      // t1 = inptr[56]
-    lh        t7, 16(a0)       // t7 = inptr[8]
-    lh        t5, 80(a0)       // t5 = inptr[40]
-    lh        t3, 48(a0)       // t3 = inptr[24]
-    or        s4, s4, t1
-    or        s4, s4, t3
-    or        s4, s4, t5
-    or        s4, s4, t7
-    or        s4, s4, s5
-    or        s4, s4, s6
-    bnez      s4, 2f
-     addiu    v1, v1, -1
-    lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
-    lh        s6, 0(a0)        // inptr[DCTSIZE*0]
-    mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
-    sll       s5, s5, 2
-    sw        s5, 0(v0)
-    sw        s5, 32(v0)
-    sw        s5, 64(v0)
-    sw        s5, 96(v0)
-    sw        s5, 128(v0)
-    sw        s5, 160(v0)
-    sw        s5, 192(v0)
-    b         3f
-     sw       s5, 224(v0)
-2:
-    lh        t0, 112(a1)
-    lh        t2, 48(a1)
-    lh        t4, 80(a1)
-    lh        t6, 16(a1)
-    mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
-    mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
-    mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
-    mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
-    lh        t4, 32(a1)
-    lh        t5, 32(a0)
-    lh        t6, 96(a1)
-    lh        t7, 96(a0)
-    addu      s0, t0, t1       // z3 = tmp0 + tmp2
-    addu      s1, t1, t2       // z2 = tmp1 + tmp2
-    addu      s2, t2, t3       // z4 = tmp1 + tmp3
-    addu      s3, s0, s2       // z3 + z4
-    addiu     t9, zero, 9633   // FIX_1_175875602
-    mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
-    addu      t8, t0, t3       // z1 = tmp0 + tmp3
-    addiu     t9, zero, 2446   // FIX_0_298631336
-    mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
-    addiu     t9, zero, 16819  // FIX_2_053119869
-    mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
-    addiu     t9, zero, 25172  // FIX_3_072711026
-    mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
-    addiu     t9, zero, 12299  // FIX_1_501321110
-    mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
-    addiu     t9, zero, 16069  // FIX_1_961570560
-    mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
-    addiu     t9, zero, 3196   // FIX_0_390180644
-    mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
-    addiu     t9, zero, 7373   // FIX_0_899976223
-    mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
-    addiu     t9, zero, 20995  // FIX_2_562915447
-    mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
-    subu      s0, s3, s0       // z3 += z5
-    addu      t0, t0, s0       // tmp0 += z3
-    addu      t1, t1, s0       // tmp2 += z3
-    subu      s2, s3, s2       // z4 += z5
-    addu      t2, t2, s2       // tmp1 += z4
-    addu      t3, t3, s2       // tmp3 += z4
-    subu      t0, t0, t8       // tmp0 += z1
-    subu      t1, t1, s1       // tmp2 += z2
-    subu      t2, t2, s1       // tmp1 += z2
-    subu      t3, t3, t8       // tmp3 += z1
-    mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
-    addiu     t9, zero, 6270   // FIX_0_765366865
-    mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
-    lh        t4, 0(a1)
-    lh        t5, 0(a0)
-    lh        t6, 64(a1)
-    lh        t7, 64(a0)
-    mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
-    mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
-    mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
-    addiu     t9, zero, 4433   // FIX_0_541196100
-    addu      s3, s0, s1       // z2 + z3
-    mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
-    addiu     t9, zero, 15137  // FIX_1_847759065
-    mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
-    addu      t4, t5, t6
-    subu      t5, t5, t6
-    sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
-    sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
-    addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
-    subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
-    addu      s0, t4, t7
-    subu      s1, t4, t7
-    addu      s2, t5, t6
-    subu      s3, t5, t6
-    addu      t4, s0, t3
-    subu      s0, s0, t3
-    addu      t3, s2, t1
-    subu      s2, s2, t1
-    addu      t1, s3, t2
-    subu      s3, s3, t2
-    addu      t2, s1, t0
-    subu      s1, s1, t0
-    shra_r.w  t4, t4, 11
-    shra_r.w  t3, t3, 11
-    shra_r.w  t1, t1, 11
-    shra_r.w  t2, t2, 11
-    shra_r.w  s1, s1, 11
-    shra_r.w  s3, s3, 11
-    shra_r.w  s2, s2, 11
-    shra_r.w  s0, s0, 11
-    sw        t4, 0(v0)
-    sw        t3, 32(v0)
-    sw        t1, 64(v0)
-    sw        t2, 96(v0)
-    sw        s1, 128(v0)
-    sw        s3, 160(v0)
-    sw        s2, 192(v0)
-    sw        s0, 224(v0)
-3:
-    addiu     a1, a1, 2
-    addiu     a0, a0, 2
-    bgtz      v1, 1b
-     addiu    v0, v0, 4
-    move      v0, sp
-    addiu     v1, zero, 8
-4:
-    lw        t0, 8(v0)        // z2 = (JLONG) wsptr[2]
-    lw        t1, 24(v0)       // z3 = (JLONG) wsptr[6]
-    lw        t2, 0(v0)        // (JLONG) wsptr[0]
-    lw        t3, 16(v0)       // (JLONG) wsptr[4]
-    lw        s4, 4(v0)        // (JLONG) wsptr[1]
-    lw        s5, 12(v0)       // (JLONG) wsptr[3]
-    lw        s6, 20(v0)       // (JLONG) wsptr[5]
-    lw        s7, 28(v0)       // (JLONG) wsptr[7]
-    or        s4, s4, t0
-    or        s4, s4, t1
-    or        s4, s4, t3
-    or        s4, s4, s7
-    or        s4, s4, s5
-    or        s4, s4, s6
-    bnez      s4, 5f
-     addiu    v1, v1, -1
-    shra_r.w  s5, t2, 5
-    andi      s5, s5, 0x3ff
-    lbux      s5, s5(a3)
-    lw        s1, 0(a2)
-    replv.qb  s5, s5
-    usw       s5, 0(s1)
-    usw       s5, 4(s1)
-    b         6f
-     nop
-5:
-    addu      t4, t0, t1       // z2 + z3
-    addiu     t8, zero, 4433   // FIX_0_541196100
-    mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
-    addiu     t8, zero, 15137  // FIX_1_847759065
-    mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
-    addiu     t8, zero, 6270   // FIX_0_765366865
-    mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
-    addu      t4, t2, t3       // (JLONG) wsptr[0] + (JLONG) wsptr[4]
-    subu      t2, t2, t3       // (JLONG) wsptr[0] - (JLONG) wsptr[4]
-    sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
-    sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
-    subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
-    subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
-    addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
-    addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
-    subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
-    addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
-    lw        t4, 28(v0)       // tmp0 = (JLONG) wsptr[7]
-    lw        t6, 12(v0)       // tmp2 = (JLONG) wsptr[3]
-    lw        t5, 20(v0)       // tmp1 = (JLONG) wsptr[5]
-    lw        t7, 4(v0)        // tmp3 = (JLONG) wsptr[1]
-    addu      s0, t4, t6       // z3 = tmp0 + tmp2
-    addiu     t8, zero, 9633   // FIX_1_175875602
-    addu      s1, t5, t7       // z4 = tmp1 + tmp3
-    addu      s2, s0, s1       // z3 + z4
-    mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
-    addu      s3, t4, t7       // z1 = tmp0 + tmp3
-    addu      t9, t5, t6       // z2 = tmp1 + tmp2
-    addiu     t8, zero, 16069  // FIX_1_961570560
-    mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
-    addiu     t8, zero, 3196   // FIX_0_390180644
-    mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
-    addiu     t8, zero, 2446   // FIX_0_298631336
-    mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
-    addiu     t8, zero, 7373   // FIX_0_899976223
-    mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
-    addiu     t8, zero, 16819  // FIX_2_053119869
-    mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
-    addiu     t8, zero, 20995  // FIX_2_562915447
-    mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
-    addiu     t8, zero, 25172  // FIX_3_072711026
-    mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
-    addiu     t8, zero, 12299  // FIX_1_501321110
-    mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
-    subu      s0, s2, s0       // z3 += z5
-    subu      s1, s2, s1       // z4 += z5
-    addu      t4, t4, s0
-    subu      t4, t4, s3       // tmp0
-    addu      t5, t5, s1
-    subu      t5, t5, t9       // tmp1
-    addu      t6, t6, s0
-    subu      t6, t6, t9       // tmp2
-    addu      t7, t7, s1
-    subu      t7, t7, s3       // tmp3
-    addu      s0, t0, t7
-    subu      t0, t0, t7
-    addu      t7, t2, t6
-    subu      t2, t2, t6
-    addu      t6, t3, t5
-    subu      t3, t3, t5
-    addu      t5, t1, t4
-    subu      t1, t1, t4
-    shra_r.w  s0, s0, 18
-    shra_r.w  t7, t7, 18
-    shra_r.w  t6, t6, 18
-    shra_r.w  t5, t5, 18
-    shra_r.w  t1, t1, 18
-    shra_r.w  t3, t3, 18
-    shra_r.w  t2, t2, 18
-    shra_r.w  t0, t0, 18
-    andi      s0, s0, 0x3ff
-    andi      t7, t7, 0x3ff
-    andi      t6, t6, 0x3ff
-    andi      t5, t5, 0x3ff
-    andi      t1, t1, 0x3ff
-    andi      t3, t3, 0x3ff
-    andi      t2, t2, 0x3ff
-    andi      t0, t0, 0x3ff
-    lw        s1, 0(a2)
-    lbux      s0, s0(a3)
-    lbux      t7, t7(a3)
-    lbux      t6, t6(a3)
-    lbux      t5, t5(a3)
-    lbux      t1, t1(a3)
-    lbux      t3, t3(a3)
-    lbux      t2, t2(a3)
-    lbux      t0, t0(a3)
-    sb        s0, 0(s1)
-    sb        t7, 1(s1)
-    sb        t6, 2(s1)
-    sb        t5, 3(s1)
-    sb        t1, 4(s1)
-    sb        t3, 5(s1)
-    sb        t2, 6(s1)
-    sb        t0, 7(s1)
-6:
-    addiu     v0, v0, 32
-    bgtz      v1, 4b
-     addiu    a2, a2, 4
-    addiu     sp, sp, 256
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j         ra
-     nop
-
-END(jsimd_idct_islow_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
-/*
- * a0     - inptr
- * a1     - quantptr
- * a2     - wsptr
- * a3     - mips_idct_ifast_coefs
- */
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    addiu          t9, a0, 16            // end address
-    or             AT, a3, zero
-
-0:
-    lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
-    lw             t0, 0(a0)             // inptr[DCTSIZE*0]
-    lw             t1, 16(a0)            // inptr[DCTSIZE*1]
-    muleq_s.w.phl  v0, t0, s0            // tmp0 ...
-    lw             t2, 32(a0)            // inptr[DCTSIZE*2]
-    lw             t3, 48(a0)            // inptr[DCTSIZE*3]
-    lw             t4, 64(a0)            // inptr[DCTSIZE*4]
-    lw             t5, 80(a0)            // inptr[DCTSIZE*5]
-    muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
-    lw             t6, 96(a0)            // inptr[DCTSIZE*6]
-    lw             t7, 112(a0)           // inptr[DCTSIZE*7]
-    or             s4, t1, t2
-    or             s5, t3, t4
-    bnez           s4, 1f
-     ins           t0, v0, 16, 16        // ... tmp0
-    bnez           s5, 1f
-     or            s6, t5, t6
-    or             s6, s6, t7
-    bnez           s6, 1f
-     sw            t0, 0(a2)             // wsptr[DCTSIZE*0]
-    sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
-    sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
-    sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
-    sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
-    sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
-    sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
-    sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
-    addiu          a0, a0, 4
-    b              2f
-     addiu         a1, a1, 4
-
-1:
-    lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
-    lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
-    muleq_s.w.phl  v0, t2, s1            // tmp1 ...
-    muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
-    lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
-    lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
-    lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
-    muleq_s.w.phl  v1, t4, s2            // tmp2 ...
-    muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
-    lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
-    lw             t8, 4(AT)             // FIX(1.414213562)
-    ins            t2, v0, 16, 16        // ... tmp1
-    muleq_s.w.phl  v0, t6, s3            // tmp3 ...
-    muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
-    ins            t4, v1, 16, 16        // ... tmp2
-    addq.ph        s4, t0, t4            // tmp10
-    subq.ph        s5, t0, t4            // tmp11
-    ins            t6, v0, 16, 16        // ... tmp3
-    subq.ph        s6, t2, t6            // tmp12 ...
-    addq.ph        s7, t2, t6            // tmp13
-    mulq_s.ph      s6, s6, t8            // ... tmp12 ...
-    addq.ph        t0, s4, s7            // tmp0
-    subq.ph        t6, s4, s7            // tmp3
-    muleq_s.w.phl  v0, t1, s0            // tmp4 ...
-    muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
-    shll_s.ph      s6, s6, 1             // x2
-    lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
-    subq.ph        s6, s6, s7            // ... tmp12
-    muleq_s.w.phl  v1, t7, s3            // tmp7 ...
-    muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
-    ins            t1, v0, 16, 16        // ... tmp4
-    addq.ph        t2, s5, s6            // tmp1
-    subq.ph        t4, s5, s6            // tmp2
-    muleq_s.w.phl  v0, t5, s2            // tmp6 ...
-    muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
-    ins            t7, v1, 16, 16        // ... tmp7
-    addq.ph        s5, t1, t7            // z11
-    subq.ph        s6, t1, t7            // z12
-    muleq_s.w.phl  v1, t3, s1            // tmp5 ...
-    muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
-    ins            t5, v0, 16, 16        // ... tmp6
-    ins            t3, v1, 16, 16        // ... tmp5
-    addq.ph        s7, t5, t3            // z13
-    subq.ph        v0, t5, t3            // z10
-    addq.ph        t7, s5, s7            // tmp7
-    subq.ph        s5, s5, s7            // tmp11 ...
-    addq.ph        v1, v0, s6            // z5 ...
-    mulq_s.ph      s5, s5, t8            // ... tmp11
-    lw             t8, 8(AT)             // FIX(1.847759065)
-    lw             s4, 0(AT)             // FIX(1.082392200)
-    addq.ph        s0, t0, t7
-    subq.ph        s1, t0, t7
-    mulq_s.ph      v1, v1, t8            // ... z5
-    shll_s.ph      s5, s5, 1             // x2
-    lw             t8, 12(AT)            // FIX(-2.613125930)
-    sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
-    shll_s.ph      v0, v0, 1             // x4
-    mulq_s.ph      v0, v0, t8            // tmp12 ...
-    mulq_s.ph      s4, s6, s4            // tmp10 ...
-    shll_s.ph      v1, v1, 1             // x2
-    addiu          a0, a0, 4
-    addiu          a1, a1, 4
-    sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
-    shll_s.ph      s6, v0, 1             // x4
-    shll_s.ph      s4, s4, 1             // x2
-    addq.ph        s6, s6, v1            // ... tmp12
-    subq.ph        t5, s6, t7            // tmp6
-    subq.ph        s4, s4, v1            // ... tmp10
-    subq.ph        t3, s5, t5            // tmp5
-    addq.ph        s2, t2, t5
-    addq.ph        t1, s4, t3            // tmp4
-    subq.ph        s3, t2, t5
-    sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
-    sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
-    addq.ph        v0, t4, t3
-    subq.ph        v1, t4, t3
-    sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
-    sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
-    addq.ph        v0, t6, t1
-    subq.ph        v1, t6, t1
-    sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
-    sw             v1, 48(a2)            // wsptr[DCTSIZE*3]
-
-2:
-    bne            a0, t9, 0b
-     addiu         a2, a2, 4
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j              ra
-     nop
-
-END(jsimd_idct_ifast_cols_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
-/*
- * a0     - wsptr
- * a1     - output_buf
- * a2     - output_col
- * a3     - mips_idct_ifast_coefs
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
-    addiu         t9, a0, 128        // end address
-    lui           s8, 0x8080
-    ori           s8, s8, 0x8080
-
-0:
-    lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
-    lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
-    lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
-    lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
-    lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
-    lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
-    lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
-    lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
-    lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
-    precrq.ph.w   t1, s0, t0         // B b
-    ins           t0, s0, 16, 16     // A a
-    bnez          t1, 1f
-     or           s0, t2, s2
-    bnez          s0, 1f
-     or           s0, t4, s4
-    bnez          s0, 1f
-     or           s0, t6, s6
-    bnez          s0, 1f
-     shll_s.ph    s0, t0, 2          // A a
-    lw            a3, 0(a1)
-    lw            AT, 4(a1)
-    precrq.ph.w   t0, s0, s0         // A A
-    ins           s0, s0, 16, 16     // a a
-    addu          a3, a3, a2
-    addu          AT, AT, a2
-    precrq.qb.ph  t0, t0, t0         // A A A A
-    precrq.qb.ph  s0, s0, s0         // a a a a
-    addu.qb       s0, s0, s8
-    addu.qb       t0, t0, s8
-    sw            s0, 0(a3)
-    sw            s0, 4(a3)
-    sw            t0, 0(AT)
-    sw            t0, 4(AT)
-    addiu         a0, a0, 32
-    bne           a0, t9, 0b
-     addiu        a1, a1, 8
-    b             2f
-     nop
-
-1:
-    precrq.ph.w   t3, s2, t2
-    ins           t2, s2, 16, 16
-    precrq.ph.w   t5, s4, t4
-    ins           t4, s4, 16, 16
-    precrq.ph.w   t7, s6, t6
-    ins           t6, s6, 16, 16
-    lw            t8, 4(AT)          // FIX(1.414213562)
-    addq.ph       s4, t0, t4         // tmp10
-    subq.ph       s5, t0, t4         // tmp11
-    subq.ph       s6, t2, t6         // tmp12 ...
-    addq.ph       s7, t2, t6         // tmp13
-    mulq_s.ph     s6, s6, t8         // ... tmp12 ...
-    addq.ph       t0, s4, s7         // tmp0
-    subq.ph       t6, s4, s7         // tmp3
-    shll_s.ph     s6, s6, 1          // x2
-    subq.ph       s6, s6, s7         // ... tmp12
-    addq.ph       t2, s5, s6         // tmp1
-    subq.ph       t4, s5, s6         // tmp2
-    addq.ph       s5, t1, t7         // z11
-    subq.ph       s6, t1, t7         // z12
-    addq.ph       s7, t5, t3         // z13
-    subq.ph       v0, t5, t3         // z10
-    addq.ph       t7, s5, s7         // tmp7
-    subq.ph       s5, s5, s7         // tmp11 ...
-    addq.ph       v1, v0, s6         // z5 ...
-    mulq_s.ph     s5, s5, t8         // ... tmp11
-    lw            t8, 8(AT)          // FIX(1.847759065)
-    lw            s4, 0(AT)          // FIX(1.082392200)
-    addq.ph       s0, t0, t7         // tmp0 + tmp7
-    subq.ph       s7, t0, t7         // tmp0 - tmp7
-    mulq_s.ph     v1, v1, t8         // ... z5
-    lw            a3, 0(a1)
-    lw            t8, 12(AT)         // FIX(-2.613125930)
-    shll_s.ph     s5, s5, 1          // x2
-    addu          a3, a3, a2
-    shll_s.ph     v0, v0, 1          // x4
-    mulq_s.ph     v0, v0, t8         // tmp12 ...
-    mulq_s.ph     s4, s6, s4         // tmp10 ...
-    shll_s.ph     v1, v1, 1          // x2
-    addiu         a0, a0, 32
-    addiu         a1, a1, 8
-    shll_s.ph     s6, v0, 1          // x4
-    shll_s.ph     s4, s4, 1          // x2
-    addq.ph       s6, s6, v1         // ... tmp12
-    shll_s.ph     s0, s0, 2
-    subq.ph       t5, s6, t7         // tmp6
-    subq.ph       s4, s4, v1         // ... tmp10
-    subq.ph       t3, s5, t5         // tmp5
-    shll_s.ph     s7, s7, 2
-    addq.ph       t1, s4, t3         // tmp4
-    addq.ph       s1, t2, t5         // tmp1 + tmp6
-    subq.ph       s6, t2, t5         // tmp1 - tmp6
-    addq.ph       s2, t4, t3         // tmp2 + tmp5
-    subq.ph       s5, t4, t3         // tmp2 - tmp5
-    addq.ph       s4, t6, t1         // tmp3 + tmp4
-    subq.ph       s3, t6, t1         // tmp3 - tmp4
-    shll_s.ph     s1, s1, 2
-    shll_s.ph     s2, s2, 2
-    shll_s.ph     s3, s3, 2
-    shll_s.ph     s4, s4, 2
-    shll_s.ph     s5, s5, 2
-    shll_s.ph     s6, s6, 2
-    precrq.ph.w   t0, s1, s0         // B A
-    ins           s0, s1, 16, 16     // b a
-    precrq.ph.w   t2, s3, s2         // D C
-    ins           s2, s3, 16, 16     // d c
-    precrq.ph.w   t4, s5, s4         // F E
-    ins           s4, s5, 16, 16     // f e
-    precrq.ph.w   t6, s7, s6         // H G
-    ins           s6, s7, 16, 16     // h g
-    precrq.qb.ph  t0, t2, t0         // D C B A
-    precrq.qb.ph  s0, s2, s0         // d c b a
-    precrq.qb.ph  t4, t6, t4         // H G F E
-    precrq.qb.ph  s4, s6, s4         // h g f e
-    addu.qb       s0, s0, s8
-    addu.qb       s4, s4, s8
-    sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
-    sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
-    lw            a3, -4(a1)
-    addu.qb       t0, t0, s8
-    addu          a3, a3, a2
-    addu.qb       t4, t4, s8
-    sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
-    bne           a0, t9, 0b
-     sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E
-
-2:
-
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
-
-    j             ra
-     nop
-
-END(jsimd_idct_ifast_rows_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
-/*
- * a0     - data
- */
-
-    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
-    lui       t0, 6437
-    ori       t0, 2260
-    lui       t1, 9633
-    ori       t1, 11363
-    lui       t2, 0xd39e
-    ori       t2, 0xe6dc
-    lui       t3, 0xf72d
-    ori       t3, 9633
-    lui       t4, 2261
-    ori       t4, 9633
-    lui       t5, 0xd39e
-    ori       t5, 6437
-    lui       t6, 9633
-    ori       t6, 0xd39d
-    lui       t7, 0xe6dc
-    ori       t7, 2260
-    lui       t8, 4433
-    ori       t8, 10703
-    lui       t9, 0xd630
-    ori       t9, 4433
-    li        s8, 8
-    move      a1, a0
-1:
-    lw        s0, 0(a1)     // tmp0 = 1|0
-    lw        s1, 4(a1)     // tmp1 = 3|2
-    lw        s2, 8(a1)     // tmp2 = 5|4
-    lw        s3, 12(a1)    // tmp3 = 7|6
-    packrl.ph s1, s1, s1    // tmp1 = 2|3
-    packrl.ph s3, s3, s3    // tmp3 = 6|7
-    subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
-    subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
-    mult      $0, $0        // ac0  = 0
-    dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
-    dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
-    mult      $ac1, $0, $0  // ac1  = 0
-    dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
-    dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
-    mult      $ac2, $0, $0  // ac2  = 0
-    dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
-    dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
-    mult      $ac3, $0, $0  // ac3  = 0
-    dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
-    dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
-    addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
-    addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
-    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
-    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
-    extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
-    extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
-    addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
-    subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
-    sh        s0, 2(a1)
-    sh        s1, 6(a1)
-    sh        s2, 10(a1)
-    sh        s3, 14(a1)
-    mult      $0, $0        // ac0  = 0
-    dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
-    mult      $ac1, $0, $0  // ac1  = 0
-    dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
-    sra       s4, s5, 16    // tmp4 = t11
-    addiu     a1, a1, 16
-    addiu     s8, s8, -1
-    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
-    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
-    addu      s2, s5, s4    // tmp2 = t10 + t11
-    subu      s3, s5, s4    // tmp3 = t10 - t11
-    sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
-    sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
-    sh        s2, -16(a1)
-    sh        s3, -8(a1)
-    sh        s0, -12(a1)
-    bgtz      s8, 1b
-     sh       s1, -4(a1)
-    li        t0, 2260
-    li        t1, 11363
-    li        t2, 9633
-    li        t3, 6436
-    li        t4, 6437
-    li        t5, 2261
-    li        t6, 11362
-    li        t7, 2259
-    li        t8, 4433
-    li        t9, 10703
-    li        a1, 10704
-    li        s8, 8
-
-2:
-    lh        a2, 0(a0)     // 0
-    lh        a3, 16(a0)    // 8
-    lh        v0, 32(a0)    // 16
-    lh        v1, 48(a0)    // 24
-    lh        s4, 64(a0)    // 32
-    lh        s5, 80(a0)    // 40
-    lh        s6, 96(a0)    // 48
-    lh        s7, 112(a0)   // 56
-    addu      s2, v0, s5    // tmp2 = 16 + 40
-    subu      s5, v0, s5    // tmp5 = 16 - 40
-    addu      s3, v1, s4    // tmp3 = 24 + 32
-    subu      s4, v1, s4    // tmp4 = 24 - 32
-    addu      s0, a2, s7    // tmp0 =  0 + 56
-    subu      s7, a2, s7    // tmp7 =  0 - 56
-    addu      s1, a3, s6    // tmp1 =  8 + 48
-    subu      s6, a3, s6    // tmp6 =  8 - 48
-    addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
-    subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
-    addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
-    subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
-    mult      s7, t1        // ac0  = tmp7 * c1
-    madd      s4, t0        // ac0 += tmp4 * c0
-    madd      s5, t4        // ac0 += tmp5 * c4
-    madd      s6, t2        // ac0 += tmp6 * c2
-    mult      $ac1, s7, t2  // ac1  = tmp7 * c2
-    msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
-    msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
-    msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
-    mult      $ac2, s7, t4  // ac2  = tmp7 * c4
-    madd      $ac2, s4, t2  // ac2 += tmp4 * c2
-    madd      $ac2, s5, t5  // ac2 += tmp5 * c5
-    msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
-    mult      $ac3, s7, t0  // ac3  = tmp7 * c0
-    msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
-    madd      $ac3, s5, t2  // ac3 += tmp5 * c2
-    msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
-    extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
-    extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
-    extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
-    extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
-    addiu     s8, s8, -1
-    addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
-    subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
-    sh        s0, 16(a0)
-    sh        s1, 48(a0)
-    sh        s2, 80(a0)
-    sh        s3, 112(a0)
-    mult      v0, t8        // ac0  = tmp12 * c8
-    madd      v1, t9        // ac0 += tmp13 * c9
-    mult      $ac1, v1, t8  // ac1  = tmp13 * c8
-    msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
-    addiu     a0, a0, 2
-    extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
-    extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
-    shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
-    shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
-    sh        s4, -2(a0)
-    sh        s5, 62(a0)
-    sh        s6, 30(a0)
-    bgtz      s8, 2b
-     sh       s7, 94(a0)
-
-    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
-
-    jr       ra
-     nop
-
-END(jsimd_fdct_islow_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
-/*
- * a0     - data
- */
-    .set at
-    SAVE_REGS_ON_STACK 8, s0, s1
-    li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
-    li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
-    li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
-    li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
-
-    move         v0, a0
-    addiu        v1, v0, 128     // end address
-
-0:
-    lw           t0, 0(v0)       // tmp0 = 1|0
-    lw           t1, 4(v0)       // tmp1 = 3|2
-    lw           t2, 8(v0)       // tmp2 = 5|4
-    lw           t3, 12(v0)      // tmp3 = 7|6
-    packrl.ph    t1, t1, t1      // tmp1 = 2|3
-    packrl.ph    t3, t3, t3      // tmp3 = 6|7
-    subq.ph      t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
-    subq.ph      t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
-    addq.ph      t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
-    addq.ph      t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
-    addq.ph      t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
-    subq.ph      t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
-    sra          t4, t8, 16      // tmp4 = t11
-    mult         $0, $0          // ac0  = 0
-    dpa.w.ph     $ac0, t9, s1
-    mult         $ac1, $0, $0    // ac1  = 0
-    dpa.w.ph     $ac1, t7, a3    // ac1 += t4*98 + t5*98
-    dpsx.w.ph    $ac1, t5, a3    // ac1 += t6*98 + t7*98
-    mult         $ac2, $0, $0    // ac2  = 0
-    dpa.w.ph     $ac2, t7, a2    // ac2 += t4*139 + t5*139
-    mult         $ac3, $0, $0    // ac3  = 0
-    dpa.w.ph     $ac3, t5, a1    // ac3 += t6*334 + t7*334
-    precrq.ph.w  t0, t5, t7      // t0 = t5|t6
-    addq.ph      t2, t8, t4      // tmp2 = t10 + t11
-    subq.ph      t3, t8, t4      // tmp3 = t10 - t11
-    extr.w       t4, $ac0, 8
-    mult         $0, $0          // ac0  = 0
-    dpa.w.ph     $ac0, t0, s1    // ac0 += t5*181 + t6*181
-    extr.w       t0, $ac1, 8     // t0 = z5
-    extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
-    extr.w       t7, $ac3, 8     // t2 = MULTIPLY(tmp12, 334)
-    extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
-    add          t6, t1, t0      // t6 = z2
-    add          t7, t7, t0      // t7 = z4
-    subq.ph      t0, t5, t8      // t0 = z13 = tmp7 - z3
-    addq.ph      t8, t5, t8      // t9 = z11 = tmp7 + z3
-    addq.ph      t1, t0, t6      // t1 = z13 + z2
-    subq.ph      t6, t0, t6      // t6 = z13 - z2
-    addq.ph      t0, t8, t7      // t0 = z11 + z4
-    subq.ph      t7, t8, t7      // t7 = z11 - z4
-    addq.ph      t5, t4, t9
-    subq.ph      t4, t9, t4
-    sh           t2, 0(v0)
-    sh           t5, 4(v0)
-    sh           t3, 8(v0)
-    sh           t4, 12(v0)
-    sh           t1, 10(v0)
-    sh           t6, 6(v0)
-    sh           t0, 2(v0)
-    sh           t7, 14(v0)
-    addiu        v0, 16
-    bne          v1, v0, 0b
-     nop
-    move         v0, a0
-    addiu        v1, v0, 16
-
-1:
-    lh           t0, 0(v0)       // 0
-    lh           t1, 16(v0)      // 8
-    lh           t2, 32(v0)      // 16
-    lh           t3, 48(v0)      // 24
-    lh           t4, 64(v0)      // 32
-    lh           t5, 80(v0)      // 40
-    lh           t6, 96(v0)      // 48
-    lh           t7, 112(v0)     // 56
-    add          t8, t0, t7      // t8 = tmp0
-    sub          t7, t0, t7      // t7 = tmp7
-    add          t0, t1, t6      // t0 = tmp1
-    sub          t1, t1, t6      // t1 = tmp6
-    add          t6, t2, t5      // t6 = tmp2
-    sub          t5, t2, t5      // t5 = tmp5
-    add          t2, t3, t4      // t2 = tmp3
-    sub          t3, t3, t4      // t3 = tmp4
-    add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
-    sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
-    sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
-    ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
-    add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
-    mult         $0, $0          // ac0  = 0
-    dpa.w.ph     $ac0, t8, s1    // ac0 += t12*181 + t13*181
-    add          s0, t4, t2      // t8 = tmp10+tmp11
-    sub          t4, t4, t2      // t4 = tmp10-tmp11
-    sh           s0, 0(v0)
-    sh           t4, 64(v0)
-    extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
-    addq.ph      t4, t8, t2      // t9 = tmp13 + z1
-    subq.ph      t8, t8, t2      // t2 = tmp13 - z1
-    sh           t4, 32(v0)
-    sh           t8, 96(v0)
-    add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
-    add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
-    add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
-    andi         t4, a1, 0xffff
-    mul          s0, t1, t4
-    sra          s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
-    ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
-    mult         $0, $0          // ac0  = 0
-    mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
-    extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
-    add          t2, t7, t8      // t2 = tmp7 + z5
-    sub          t7, t7, t8      // t7 = tmp7 - z5
-    andi         t4, a2, 0xffff
-    mul          t8, t3, t4
-    sra          t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
-    andi         t4, s1, 0xffff
-    mul          t6, t0, t4
-    sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
-    add          t0, t6, t8      // t0 = z3 + z2
-    sub          t1, t6, t8      // t1 = z3 - z2
-    add          t3, t6, s0      // t3 = z3 + z4
-    sub          t4, t6, s0      // t4 = z3 - z4
-    sub          t5, t2, t1      // t5 = dataptr[5]
-    sub          t6, t7, t0      // t6 = dataptr[3]
-    add          t3, t2, t3      // t3 = dataptr[1]
-    add          t4, t7, t4      // t4 = dataptr[7]
-    sh           t5, 80(v0)
-    sh           t6, 48(v0)
-    sh           t3, 16(v0)
-    sh           t4, 112(v0)
-    addiu        v0, 2
-    bne          v0, v1, 1b
-     nop
-
-    RESTORE_REGS_FROM_STACK 8, s0, s1
-
-    j            ra
-     nop
-END(jsimd_fdct_ifast_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
-/*
- * a0     - coef_block
- * a1     - divisors
- * a2     - workspace
- */
-
-    .set at
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2
-
-    addiu   v0, a2, 124  // v0 = workspace_end
-    lh      t0, 0(a2)
-    lh      t1, 0(a1)
-    lh      t2, 128(a1)
-    sra     t3, t0, 15
-    sll     t3, t3, 1
-    addiu   t3, t3, 1
-    mul     t0, t0, t3
-    lh      t4, 384(a1)
-    lh      t5, 130(a1)
-    lh      t6, 2(a2)
-    lh      t7, 2(a1)
-    lh      t8, 386(a1)
-
-1:
-    andi    t1, 0xffff
-    add     t9, t0, t2
-    andi    t9, 0xffff
-    mul     v1, t9, t1
-    sra     s0, t6, 15
-    sll     s0, s0, 1
-    addiu   s0, s0, 1
-    addiu   t9, t4, 16
-    srav    v1, v1, t9
-    mul     v1, v1, t3
-    mul     t6, t6, s0
-    andi    t7, 0xffff
-    addiu   a2, a2, 4
-    addiu   a1, a1, 4
-    add     s1, t6, t5
-    andi    s1, 0xffff
-    sh      v1, 0(a0)
-
-    mul     s2, s1, t7
-    addiu   s1, t8, 16
-    srav    s2, s2, s1
-    mul     s2,s2, s0
-    lh      t0, 0(a2)
-    lh      t1, 0(a1)
-    sra     t3, t0, 15
-    sll     t3, t3, 1
-    addiu   t3, t3, 1
-    mul     t0, t0, t3
-    lh      t2, 128(a1)
-    lh      t4, 384(a1)
-    lh      t5, 130(a1)
-    lh      t8, 386(a1)
-    lh      t6, 2(a2)
-    lh      t7, 2(a1)
-    sh      s2, 2(a0)
-    lh      t0, 0(a2)
-    sra     t3, t0, 15
-    sll     t3, t3, 1
-    addiu   t3, t3, 1
-    mul     t0, t0,t3
-    bne     a2, v0, 1b
-     addiu  a0, a0, 4
-
-    andi    t1, 0xffff
-    add     t9, t0, t2
-    andi    t9, 0xffff
-    mul     v1, t9, t1
-    sra     s0, t6, 15
-    sll     s0, s0, 1
-    addiu   s0, s0, 1
-    addiu   t9, t4, 16
-    srav    v1, v1, t9
-    mul     v1, v1, t3
-    mul     t6, t6, s0
-    andi    t7, 0xffff
-    sh      v1, 0(a0)
-    add     s1, t6, t5
-    andi    s1, 0xffff
-    mul     s2, s1, t7
-    addiu   s1, t8, 16
-    addiu   a2, a2, 4
-    addiu   a1, a1, 4
-    srav    s2, s2, s1
-    mul     s2, s2, s0
-    sh      s2, 2(a0)
-
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
-
-    j       ra
-     nop
-
-END(jsimd_quantize_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
-/*
- * a0     - coef_block
- * a1     - divisors
- * a2     - workspace
- */
-
-    .set at
-
-    li         t1, 0x46800100     //integer representation 16384.5
-    mtc1       t1, f0
-    li         t0, 63
-0:
-    lwc1       f2, 0(a2)
-    lwc1       f10, 0(a1)
-    lwc1       f4, 4(a2)
-    lwc1       f12, 4(a1)
-    lwc1       f6, 8(a2)
-    lwc1       f14, 8(a1)
-    lwc1       f8, 12(a2)
-    lwc1       f16, 12(a1)
-    madd.s     f2, f0, f2, f10
-    madd.s     f4, f0, f4, f12
-    madd.s     f6, f0, f6, f14
-    madd.s     f8, f0, f8, f16
-    lwc1       f10, 16(a1)
-    lwc1       f12, 20(a1)
-    trunc.w.s  f2, f2
-    trunc.w.s  f4, f4
-    trunc.w.s  f6, f6
-    trunc.w.s  f8, f8
-    lwc1       f14, 24(a1)
-    lwc1       f16, 28(a1)
-    mfc1       t1, f2
-    mfc1       t2, f4
-    mfc1       t3, f6
-    mfc1       t4, f8
-    lwc1       f2, 16(a2)
-    lwc1       f4, 20(a2)
-    lwc1       f6, 24(a2)
-    lwc1       f8, 28(a2)
-    madd.s     f2, f0, f2, f10
-    madd.s     f4, f0, f4, f12
-    madd.s     f6, f0, f6, f14
-    madd.s     f8, f0, f8, f16
-    addiu      t1, t1, -16384
-    addiu      t2, t2, -16384
-    addiu      t3, t3, -16384
-    addiu      t4, t4, -16384
-    trunc.w.s  f2, f2
-    trunc.w.s  f4, f4
-    trunc.w.s  f6, f6
-    trunc.w.s  f8, f8
-    sh         t1, 0(a0)
-    sh         t2, 2(a0)
-    sh         t3, 4(a0)
-    sh         t4, 6(a0)
-    mfc1       t1, f2
-    mfc1       t2, f4
-    mfc1       t3, f6
-    mfc1       t4, f8
-    addiu      t0, t0, -8
-    addiu      a2, a2, 32
-    addiu      a1, a1, 32
-    addiu      t1, t1, -16384
-    addiu      t2, t2, -16384
-    addiu      t3, t3, -16384
-    addiu      t4, t4, -16384
-    sh         t1, 8(a0)
-    sh         t2, 10(a0)
-    sh         t3, 12(a0)
-    sh         t4, 14(a0)
-    bgez       t0, 0b
-     addiu     a0, a0, 16
-
-    j          ra
-     nop
-
-END(jsimd_quantize_float_mips_dspr2)
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - output_buf
- * a3     - output_col
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
-
-    addiu     sp, sp, -40
-    move      v0, sp
-    addiu     s2, zero, 29692
-    addiu     s3, zero, -10426
-    addiu     s4, zero, 6967
-    addiu     s5, zero, -5906
-    lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
-    lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
-    lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
-    lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
-    mul       t4, t5, t0
-    lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
-    lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
-    mul       t6, t6, t1
-    mul       t5, t5, t0
-    lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
-    lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
-    lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
-    lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
-    mul       t7, t7, t2
-    mult      zero, zero
-    mul       t8, t8, t3
-    li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
-    li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
-    ins       t6, t5, 16, 16    // t6 = t5|t6
-    sll       t4, t4, 15
-    dpa.w.ph  $ac0, t6, s0
-    lh        t1, 2(a1)
-    lh        t6, 2(a0)
-    ins       t8, t7, 16, 16    // t8 = t7|t8
-    dpa.w.ph  $ac0, t8, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 18(a1)
-    lh        t6, 18(a0)
-    lh        t2, 50(a1)
-    lh        t7, 50(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 82(a1)
-    lh        t2, 82(a0)
-    lh        t3, 114(a1)
-    lh        t4, 114(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 0(v0)
-    sw        t8, 20(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    lh        t1, 6(a1)
-    lh        t6, 6(a0)
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 22(a1)
-    lh        t6, 22(a0)
-    lh        t2, 54(a1)
-    lh        t7, 54(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 86(a1)
-    lh        t2, 86(a0)
-    lh        t3, 118(a1)
-    lh        t4, 118(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 4(v0)
-    sw        t8, 24(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    lh        t1, 10(a1)
-    lh        t6, 10(a0)
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 26(a1)
-    lh        t6, 26(a0)
-    lh        t2, 58(a1)
-    lh        t7, 58(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 90(a1)
-    lh        t2, 90(a0)
-    lh        t3, 122(a1)
-    lh        t4, 122(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 8(v0)
-    sw        t8, 28(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    lh        t1, 14(a1)
-    lh        t6, 14(a0)
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    mul       t5, t6, t1
-    lh        t1, 30(a1)
-    lh        t6, 30(a0)
-    lh        t2, 62(a1)
-    lh        t7, 62(a0)
-    mul       t6, t6, t1
-    subu      t8, t4, t0
-    mul       t7, t7, t2
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    lh        t1, 94(a1)
-    lh        t2, 94(a0)
-    lh        t3, 126(a1)
-    lh        t4, 126(a0)
-    shra_r.w  t8, t8, 13
-    mul       t1, t1, t2
-    mul       t3, t3, t4
-    sw        t0, 12(v0)
-    sw        t8, 32(v0)
-    sll       t4, t5, 15
-    ins       t7, t6, 16, 16
-    mult      zero, zero
-    dpa.w.ph  $ac0, t7, s0
-    ins       t3, t1, 16, 16
-    dpa.w.ph  $ac0, t3, s1
-    mflo      t0, $ac0
-    lw        t9, 0(a2)
-    lw        t3, 0(v0)
-    lw        t7, 4(v0)
-    lw        t1, 8(v0)
-    addu      t9, t9, a3
-    sll       t3, t3, 15
-    subu      t8, t4, t0
-    addu      t0, t4, t0
-    shra_r.w  t0, t0, 13
-    shra_r.w  t8, t8, 13
-    sw        t0, 16(v0)
-    sw        t8, 36(v0)
-    lw        t5, 12(v0)
-    lw        t6, 16(v0)
-    mult      t7, s2
-    madd      t1, s3
-    madd      t5, s4
-    madd      t6, s5
-    lw        t5, 24(v0)
-    lw        t7, 28(v0)
-    mflo      t0, $ac0
-    lw        t8, 32(v0)
-    lw        t2, 36(v0)
-    mult      $ac1, t5, s2
-    madd      $ac1, t7, s3
-    madd      $ac1, t8, s4
-    madd      $ac1, t2, s5
-    addu      t1, t3, t0
-    subu      t6, t3, t0
-    shra_r.w  t1, t1, 20
-    shra_r.w  t6, t6, 20
-    mflo      t4, $ac1
-    shll_s.w  t1, t1, 24
-    shll_s.w  t6, t6, 24
-    sra       t1, t1, 24
-    sra       t6, t6, 24
-    addiu     t1, t1, 128
-    addiu     t6, t6, 128
-    lw        t0, 20(v0)
-    sb        t1, 0(t9)
-    sb        t6, 1(t9)
-    sll       t0, t0, 15
-    lw        t9, 4(a2)
-    addu      t1, t0, t4
-    subu      t6, t0, t4
-    addu      t9, t9, a3
-    shra_r.w  t1, t1, 20
-    shra_r.w  t6, t6, 20
-    shll_s.w  t1, t1, 24
-    shll_s.w  t6, t6, 24
-    sra       t1, t1, 24
-    sra       t6, t6, 24
-    addiu     t1, t1, 128
-    addiu     t6, t6, 128
-    sb        t1, 0(t9)
-    sb        t6, 1(t9)
-    addiu     sp, sp, 40
-
-    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
-
-    j         ra
-     nop
-
-END(jsimd_idct_2x2_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - output_buf
- * a3     - output_col
- * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
- */
-
-    .set at
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    lw        v1, 48(sp)
-    move      t0, a1
-    move      t1, v1
-    li        t9, 4
-    li        s0, 0x2e75f93e
-    li        s1, 0x21f9ba79
-    li        s2, 0xecc2efb0
-    li        s3, 0x52031ccd
-
-0:
-    lh        s6, 32(t0)        // inptr[DCTSIZE*2]
-    lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
-    lh        s7, 96(t0)        // inptr[DCTSIZE*6]
-    lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
-    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        s4, 0(t0)         // inptr[DCTSIZE*0]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s5, 0(a0)         // quantptr[0]
-    li        s6, 15137
-    li        s7, 6270
-    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
-    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        t5, 112(t0)       // inptr[DCTSIZE*7]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
-    lh        v0, 80(t0)        // inptr[DCTSIZE*5]
-    lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
-    lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
-    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
-    lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
-    lh        t8, 16(t0)        // inptr[DCTSIZE*1]
-    subu      t6, t6, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
-    lh        t7, 48(t0)        // inptr[DCTSIZE*3]
-    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
-    mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
-    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
-    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
-    addu      t3, t2, t6        // tmp10 = tmp0 + z2
-    subu      t4, t2, t6        // tmp10 = tmp0 - z2
-    mult      $ac0, zero, zero
-    mult      $ac1, zero, zero
-    ins       t5, v0, 16, 16
-    ins       t7, t8, 16, 16
-    addiu     t9, t9, -1
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    mflo      s4, $ac0
-    mflo      s5, $ac1
-    addiu     a0, a0, 2
-    addiu     t1, t1, 4
-    addiu     t0, t0, 2
-    addu      t6, t4, s4
-    subu      t5, t4, s4
-    addu      s6, t3, s5
-    subu      s7, t3, s5
-    shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
-    shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
-    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
-    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
-    sw        t6, 28(t1)
-    sw        t5, 60(t1)
-    sw        s6, -4(t1)
-    bgtz      t9, 0b
-     sw       s7, 92(t1)
-    // second loop three pass
-    li        t9, 3
-1:
-    lh        s6, 34(t0)        // inptr[DCTSIZE*2]
-    lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
-    lh        s7, 98(t0)        // inptr[DCTSIZE*6]
-    lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
-    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        s4, 2(t0)         // inptr[DCTSIZE*0]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
-    li        s6, 15137
-    li        s7, 6270
-    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
-    mul       v0, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
-    lh        t5, 114(t0)       // inptr[DCTSIZE*7]
-    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
-    lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
-    lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
-    lh        t6, 82(t0)        // inptr[DCTSIZE*5]
-    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
-    lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
-    lh        t8, 18(t0)        // inptr[DCTSIZE*1]
-    subu      v0, v0, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
-    lh        t7, 50(t0)        // inptr[DCTSIZE*3]
-    lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
-    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
-    mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
-    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
-    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
-    addu      t3, t2, v0        // tmp10 = tmp0 + z2
-    subu      t4, t2, v0        // tmp10 = tmp0 - z2
-    mult      $ac0, zero, zero
-    mult      $ac1, zero, zero
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    mflo      t5, $ac0
-    mflo      t6, $ac1
-    addiu     t9, t9, -1
-    addiu     t0, t0, 2
-    addiu     a0, a0, 2
-    addiu     t1, t1, 4
-    addu      s5, t4, t5
-    subu      s4, t4, t5
-    addu      s6, t3, t6
-    subu      s7, t3, t6
-    shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
-    shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
-    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
-    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
-    sw        s5, 32(t1)
-    sw        s4, 64(t1)
-    sw        s6, 0(t1)
-    bgtz      t9, 1b
-     sw       s7, 96(t1)
-    move      t1, v1
-    li        s4, 15137
-    lw        s6, 8(t1)         // wsptr[2]
-    li        s5, 6270
-    lw        s7, 24(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 0(t1)         // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
-    lh        t5, 28(t1)        // wsptr[7]
-    lh        t6, 20(t1)        // wsptr[5]
-    lh        t7, 12(t1)        // wsptr[3]
-    lh        t8, 4(t1)         // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
-    sll       s4, t9, 2
-    lw        v0, 0(a2)         // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-    // 2
-    li        s4, 15137
-    lw        s6, 40(t1)        // wsptr[2]
-    li        s5, 6270
-    lw        s7, 56(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 32(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
-    lh        t5, 60(t1)        // wsptr[7]
-    lh        t6, 52(t1)        // wsptr[5]
-    lh        t7, 44(t1)        // wsptr[3]
-    lh        t8, 36(t1)        // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
-    sll       s4, t9, 2
-    lw        v0, 4(a2)         // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-    // 3
-    li        s4, 15137
-    lw        s6, 72(t1)        // wsptr[2]
-    li        s5, 6270
-    lw        s7, 88(t1)        // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 64(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], - FIX_0_765366865)
-    lh        t5, 92(t1)        // wsptr[7]
-    lh        t6, 84(t1)        // wsptr[5]
-    lh        t7, 76(t1)        // wsptr[3]
-    lh        t8, 68(t1)        // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
-    sll       s4, t9, 2
-    lw        v0, 8(a2)         // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-    li        s4, 15137
-    lw        s6, 104(t1)       // wsptr[2]
-    li        s5, 6270
-    lw        s7, 120(t1)       // wsptr[6]
-    mul       s4, s4, s6        // MULTIPLY((JLONG) wsptr[2], FIX_1_847759065)
-    lw        t2, 96(t1)        // wsptr[0]
-    mul       s5, s5, s7        // MULTIPLY((JLONG) wsptr[6], -FIX_0_765366865)
-    lh        t5, 124(t1)       // wsptr[7]
-    lh        t6, 116(t1)       // wsptr[5]
-    lh        t7, 108(t1)       // wsptr[3]
-    lh        t8, 100(t1)       // wsptr[1]
-    ins       t5, t6, 16, 16
-    ins       t7, t8, 16, 16
-    mult      $ac0, zero, zero
-    dpa.w.ph  $ac0, t5, s0
-    dpa.w.ph  $ac0, t7, s1
-    mult      $ac1, zero, zero
-    dpa.w.ph  $ac1, t5, s2
-    dpa.w.ph  $ac1, t7, s3
-    sll       t2, t2, 14        // tmp0 = ((JLONG) wsptr[0]) << (CONST_BITS+1)
-    mflo      s6, $ac0
-    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
-    subu      s4, s4, s5
-    addu      t3, t2, s4        // tmp10 = tmp0 + z2;
-    mflo      s7, $ac1
-    subu      t4, t2, s4        // tmp10 = tmp0 - z2;
-    addu      t7, t4, s6
-    subu      t8, t4, s6
-    addu      t5, t3, s7
-    subu      t6, t3, s7
-    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
-    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
-    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
-    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
-    sll       s4, t9, 2
-    lw        v0, 12(a2)        // output_buf[ctr]
-    shll_s.w  t5, t5, 24
-    shll_s.w  t6, t6, 24
-    shll_s.w  t7, t7, 24
-    shll_s.w  t8, t8, 24
-    sra       t5, t5, 24
-    sra       t6, t6, 24
-    sra       t7, t7, 24
-    sra       t8, t8, 24
-    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
-    addiu     t5, t5, 128
-    addiu     t6, t6, 128
-    addiu     t7, t7, 128
-    addiu     t8, t8, 128
-    sb        t5, 0(v0)
-    sb        t7, 1(v0)
-    sb        t8, 2(v0)
-    sb        t6, 3(v0)
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j         ra
-     nop
-END(jsimd_idct_4x4_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - output_buf
- * a3     - output_col
- */
-    .set at
-
-    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    addiu     sp, sp, -144
-    move      v0, sp
-    addiu     v1, v0, 24
-    addiu     t9, zero, 5793
-    addiu     s0, zero, 10033
-    addiu     s1, zero, 2998
-
-1:
-    lh        s2, 0(a0)   // q0 = quantptr[ 0]
-    lh        s3, 32(a0)  // q1 = quantptr[16]
-    lh        s4, 64(a0)  // q2 = quantptr[32]
-    lh        t2, 64(a1)  // tmp2 = inptr[32]
-    lh        t1, 32(a1)  // tmp1 = inptr[16]
-    lh        t0, 0(a1)   // tmp0 = inptr[ 0]
-    mul       t2, t2, s4  // tmp2 = tmp2 * q2
-    mul       t1, t1, s3  // tmp1 = tmp1 * q1
-    mul       t0, t0, s2  // tmp0 = tmp0 * q0
-    lh        t6, 16(a1)  // z1 = inptr[ 8]
-    lh        t8, 80(a1)  // z3 = inptr[40]
-    lh        t7, 48(a1)  // z2 = inptr[24]
-    lh        s2, 16(a0)  // q0 = quantptr[ 8]
-    lh        s4, 80(a0)  // q2 = quantptr[40]
-    lh        s3, 48(a0)  // q1 = quantptr[24]
-    mul       t2, t2, t9  // tmp2 = tmp2 * 5793
-    mul       t1, t1, s0  // tmp1 = tmp1 * 10033
-    sll       t0, t0, 13  // tmp0 = tmp0 << 13
-    mul       t6, t6, s2  // z1 = z1 * q0
-    mul       t8, t8, s4  // z3 = z3 * q2
-    mul       t7, t7, s3  // z2 = z2 * q1
-    addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
-    sll       t2, t2, 1   // tmp2 = tmp2 << 2
-    subu      t4, t0, t2  // tmp11 = tmp0 - tmp2;
-    subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
-    addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
-    addu      t1, t6, t8  // tmp1 = z1 + z3
-    mul       t1, t1, s1  // tmp1 = tmp1 * 2998
-    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
-    subu      t2, t6, t8  // tmp2 = z1 - z3
-    subu      t2, t2, t7  // tmp2 = tmp2 - z2
-    sll       t2, t2, 2   // tmp2 = tmp2 << 2
-    addu      t0, t6, t7  // tmp0 = z1 + z2
-    sll       t0, t0, 13  // tmp0 = tmp0 << 13
-    subu      s2, t8, t7  // q0 = z3 - z2
-    sll       s2, s2, 13  // q0 = q0 << 13
-    addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
-    addu      t1, s2, t1  // tmp1 = q0 + tmp1
-    addu      s2, t4, t2  // q0 = tmp11 + tmp2
-    subu      s3, t4, t2  // q1 = tmp11 - tmp2
-    addu      t6, t3, t0  // z1 = tmp10 + tmp0
-    subu      t7, t3, t0  // z2 = tmp10 - tmp0
-    addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
-    subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
-    shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
-    shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
-    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
-    shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
-    sw        s2, 24(v0)
-    sw        s3, 96(v0)
-    sw        t6, 0(v0)
-    sw        t7, 120(v0)
-    sw        t4, 48(v0)
-    sw        t5, 72(v0)
-    addiu     v0, v0, 4
-    addiu     a1, a1, 2
-    bne       v0, v1, 1b
-     addiu    a0, a0, 2
-
-    /* Pass 2: process 6 rows from work array, store into output array. */
-    move      v0, sp
-    addiu     v1, v0, 144
-
-2:
-    lw        t0, 0(v0)
-    lw        t2, 16(v0)
-    lw        s5, 0(a2)
-    addiu     t0, t0, 16
-    sll       t0, t0, 13
-    mul       t3, t2, t9
-    lw        t6, 4(v0)
-    lw        t8, 20(v0)
-    lw        t7, 12(v0)
-    addu      s5, s5, a3
-    addu      s6, t6, t8
-    mul       s6, s6, s1
-    addu      t1, t0, t3
-    subu      t4, t0, t3
-    subu      t4, t4, t3
-    lw        t3, 8(v0)
-    mul       t0, t3, s0
-    addu      s7, t6, t7
-    sll       s7, s7, 13
-    addu      s7, s6, s7
-    subu      t2, t8, t7
-    sll       t2, t2, 13
-    addu      t2, s6, t2
-    subu      s6, t6, t7
-    subu      s6, s6, t8
-    sll       s6, s6, 13
-    addu      t3, t1, t0
-    subu      t5, t1, t0
-    addu      t6, t3, s7
-    subu      t3, t3, s7
-    addu      t7, t4, s6
-    subu      t4, t4, s6
-    addu      t8, t5, t2
-    subu      t5, t5, t2
-    shll_s.w  t6, t6, 6
-    shll_s.w  t3, t3, 6
-    shll_s.w  t7, t7, 6
-    shll_s.w  t4, t4, 6
-    shll_s.w  t8, t8, 6
-    shll_s.w  t5, t5, 6
-    sra       t6, t6, 24
-    addiu     t6, t6, 128
-    sra       t3, t3, 24
-    addiu     t3, t3, 128
-    sb        t6, 0(s5)
-    sra       t7, t7, 24
-    addiu     t7, t7, 128
-    sb        t3, 5(s5)
-    sra       t4, t4, 24
-    addiu     t4, t4, 128
-    sb        t7, 1(s5)
-    sra       t8, t8, 24
-    addiu     t8, t8, 128
-    sb        t4, 4(s5)
-    addiu     v0, v0, 24
-    sra       t5, t5, 24
-    addiu     t5, t5, 128
-    sb        t8, 2(s5)
-    addiu     a2, a2,  4
-    bne       v0, v1, 2b
-     sb       t5, 3(s5)
-
-    addiu     sp, sp, 144
-
-    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
-
-    j         ra
-     nop
-
-END(jsimd_idct_6x6_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
-/*
- * a0     - compptr->dct_table
- * a1     - coef_block
- * a2     - workspace
- */
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    li         a3, 8
-
-1:
-    // odd part
-    lh         t0, 48(a1)
-    lh         t1, 48(a0)
-    lh         t2, 16(a1)
-    lh         t3, 16(a0)
-    lh         t4, 80(a1)
-    lh         t5, 80(a0)
-    lh         t6, 112(a1)
-    lh         t7, 112(a0)
-    mul        t0, t0, t1    // z2
-    mul        t1, t2, t3    // z1
-    mul        t2, t4, t5    // z3
-    mul        t3, t6, t7    // z4
-    li         t4, 10703     // FIX(1.306562965)
-    li         t5, 4433      // FIX_0_541196100
-    li         t6, 7053      // FIX(0.860918669)
-    mul        t4, t0,t4     // tmp11
-    mul        t5, t0,t5     // -tmp14
-    addu       t7, t1,t2     // tmp10
-    addu       t8, t7,t3     // tmp10 + z4
-    mul        t6, t6, t8    // tmp15
-    li         t8, 2139      // FIX(0.261052384)
-    mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
-    li         t7, 2295      // FIX(0.280143716)
-    mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
-    addu       t9, t2, t3    // z3 + z4
-    li         s0, 8565      // FIX(1.045510580)
-    mul        t9, t9, s0    // -tmp13
-    li         s0, 12112     // FIX(1.478575242)
-    mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242)
-    li         s1, 12998     // FIX(1.586706681)
-    mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
-    li         s2, 5540      // FIX(0.676326758)
-    mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
-    li         s3, 16244     // FIX(1.982889723)
-    mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
-    subu       t1, t1, t3    // z1-=z4
-    subu       t0, t0, t2    // z2-=z3
-    addu       t2, t0, t1    // z1+z2
-    li         t3, 4433      // FIX_0_541196100
-    mul        t2, t2, t3    // z3
-    li         t3, 6270      // FIX_0_765366865
-    mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
-    li         t3, 15137     // FIX_0_765366865
-    mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
-    addu       t8, t6, t8    // tmp12
-    addu       t3, t8, t4    // tmp12 + tmp11
-    addu       t3, t3, t7    // tmp10
-    subu       t8, t8, t9    // tmp12 + tmp13
-    addu       s0, t5, s0
-    subu       t8, t8, s0    // tmp12
-    subu       t9, t6, t9
-    subu       s1, s1, t4
-    addu       t9, t9, s1    // tmp13
-    subu       t6, t6, t5
-    subu       t6, t6, s2
-    subu       t6, t6, s3    // tmp15
-    // even part start
-    lh         t4, 64(a1)
-    lh         t5, 64(a0)
-    lh         t7, 32(a1)
-    lh         s0, 32(a0)
-    lh         s1, 0(a1)
-    lh         s2, 0(a0)
-    lh         s3, 96(a1)
-    lh         v0, 96(a0)
-    mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
-    mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
-    mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
-    mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
-    // odd part end
-    addu       t1, t2, t1    // tmp11
-    subu       t0, t2, t0    // tmp14
-    // update counter and pointers
-    addiu      a3, a3, -1
-    addiu      a0, a0, 2
-    addiu      a1, a1, 2
-    // even part rest
-    li         s1, 10033
-    li         s2, 11190
-    mul        t4, t4, s1    // z4
-    mul        s1, t5, s2    // z4
-    sll        t5, t5, 13    // z1
-    sll        t7, t7, 13
-    addiu      t7, t7, 1024  // z3
-    sll        s0, s0, 13    // z2
-    addu       s2, t7, t4    // tmp10
-    subu       t4, t7, t4    // tmp11
-    subu       s3, t5, s0    // tmp12
-    addu       t2, t7, s3    // tmp21
-    subu       s3, t7, s3    // tmp24
-    addu       t7, s1, s0    // tmp12
-    addu       v0, s2, t7    // tmp20
-    subu       s2, s2, t7    // tmp25
-    subu       s1, s1, t5    // z4 - z1
-    subu       s1, s1, s0    // tmp12
-    addu       s0, t4, s1    // tmp22
-    subu       t4, t4, s1    // tmp23
-    // final output stage
-    addu       t5, v0, t3
-    subu       v0, v0, t3
-    addu       t3, t2, t1
-    subu       t2, t2, t1
-    addu       t1, s0, t8
-    subu       s0, s0, t8
-    addu       t8, t4, t9
-    subu       t4, t4, t9
-    addu       t9, s3, t0
-    subu       s3, s3, t0
-    addu       t0, s2, t6
-    subu       s2, s2, t6
-    sra        t5, t5, 11
-    sra        t3, t3, 11
-    sra        t1, t1, 11
-    sra        t8, t8, 11
-    sra        t9, t9, 11
-    sra        t0, t0, 11
-    sra        s2, s2, 11
-    sra        s3, s3, 11
-    sra        t4, t4, 11
-    sra        s0, s0, 11
-    sra        t2, t2, 11
-    sra        v0, v0, 11
-    sw         t5, 0(a2)
-    sw         t3, 32(a2)
-    sw         t1, 64(a2)
-    sw         t8, 96(a2)
-    sw         t9, 128(a2)
-    sw         t0, 160(a2)
-    sw         s2, 192(a2)
-    sw         s3, 224(a2)
-    sw         t4, 256(a2)
-    sw         s0, 288(a2)
-    sw         t2, 320(a2)
-    sw         v0, 352(a2)
-    bgtz       a3, 1b
-     addiu     a2, a2, 4
-
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    j          ra
-     nop
-
-END(jsimd_idct_12x12_pass1_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
-/*
- * a0     - workspace
- * a1     - output
- */
-
-    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
-
-    li        a3, 12
-
-1:
-    // Odd part
-    lw        t0, 12(a0)
-    lw        t1, 4(a0)
-    lw        t2, 20(a0)
-    lw        t3, 28(a0)
-    li        t4, 10703     // FIX(1.306562965)
-    li        t5, 4433      // FIX_0_541196100
-    mul       t4, t0, t4    // tmp11
-    mul       t5, t0, t5    // -tmp14
-    addu      t6, t1, t2    // tmp10
-    li        t7, 2139      // FIX(0.261052384)
-    mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
-    addu      t6, t6, t3    // tmp10 + z4
-    li        t8, 7053      // FIX(0.860918669)
-    mul       t6, t6, t8    // tmp15
-    li        t8, 2295      // FIX(0.280143716)
-    mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
-    addu      t9, t2, t3    // z3 + z4
-    li        s0, 8565      // FIX(1.045510580)
-    mul       t9, t9, s0    // -tmp13
-    li        s0, 12112     // FIX(1.478575242)
-    mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
-    li        s1, 12998     // FIX(1.586706681)
-    mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
-    li        s2, 5540      // FIX(0.676326758)
-    mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
-    li        s3, 16244     // FIX(1.982889723)
-    mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
-    subu      t1, t1, t3    // z1 -= z4
-    subu      t0, t0, t2    // z2 -= z3
-    addu      t2, t1, t0    // z1 + z2
-    li        t3, 4433      // FIX_0_541196100
-    mul       t2, t2, t3    // z3
-    li        t3, 6270      // FIX_0_765366865
-    mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
-    li        t3, 15137     // FIX_1_847759065
-    mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
-    addu      t3, t6, t7    // tmp12
-    addu      t7, t3, t4
-    addu      t7, t7, t8    // tmp10
-    subu      t3, t3, t9
-    subu      t3, t3, t5
-    subu      t3, t3, s0    // tmp12
-    subu      t9, t6, t9
-    subu      t9, t9, t4
-    addu      t9, t9, s1    // tmp13
-    subu      t6, t6, t5
-    subu      t6, t6, s2
-    subu      t6, t6, s3    // tmp15
-    addu      t1, t2, t1    // tmp11
-    subu      t0, t2, t0    // tmp14
-    // even part
-    lw        t2, 16(a0)    // z4
-    lw        t4, 8(a0)     // z1
-    lw        t5, 0(a0)     // z3
-    lw        t8, 24(a0)    // z2
-    li        s0, 10033     // FIX(1.224744871)
-    li        s1, 11190     // FIX(1.366025404)
-    mul       t2, t2, s0    // z4
-    mul       s0, t4, s1    // z4
-    addiu     t5, t5, 0x10
-    sll       t5, t5, 13    // z3
-    sll       t4, t4, 13    // z1
-    sll       t8, t8, 13    // z2
-    subu      s1, t4, t8    // tmp12
-    addu      s2, t5, t2    // tmp10
-    subu      t2, t5, t2    // tmp11
-    addu      s3, t5, s1    // tmp21
-    subu      s1, t5, s1    // tmp24
-    addu      t5, s0, t8    // tmp12
-    addu      v0, s2, t5    // tmp20
-    subu      t5, s2, t5    // tmp25
-    subu      t4, s0, t4
-    subu      t4, t4, t8    // tmp12
-    addu      t8, t2, t4    // tmp22
-    subu      t2, t2, t4    // tmp23
-    // increment counter and pointers
-    addiu     a3, a3, -1
-    addiu     a0, a0, 32
-    // Final stage
-    addu      t4, v0, t7
-    subu      v0, v0, t7
-    addu      t7, s3, t1
-    subu      s3, s3, t1
-    addu      t1, t8, t3
-    subu      t8, t8, t3
-    addu      t3, t2, t9
-    subu      t2, t2, t9
-    addu      t9, s1, t0
-    subu      s1, s1, t0
-    addu      t0, t5, t6
-    subu      t5, t5, t6
-    sll       t4, t4, 4
-    sll       t7, t7, 4
-    sll       t1, t1, 4
-    sll       t3, t3, 4
-    sll       t9, t9, 4
-    sll       t0, t0, 4
-    sll       t5, t5, 4
-    sll       s1, s1, 4
-    sll       t2, t2, 4
-    sll       t8, t8, 4
-    sll       s3, s3, 4
-    sll       v0, v0, 4
-    shll_s.w  t4, t4, 2
-    shll_s.w  t7, t7, 2
-    shll_s.w  t1, t1, 2
-    shll_s.w  t3, t3, 2
-    shll_s.w  t9, t9, 2
-    shll_s.w  t0, t0, 2
-    shll_s.w  t5, t5, 2
-    shll_s.w  s1, s1, 2
-    shll_s.w  t2, t2, 2
-    shll_s.w  t8, t8, 2
-    shll_s.w  s3, s3, 2
-    shll_s.w  v0, v0, 2
-    srl       t4, t4, 24
-    srl       t7, t7, 24
-    srl       t1, t1, 24
-    srl       t3, t3, 24
-    srl       t9, t9, 24
-    srl       t0, t0, 24
-    srl       t5, t5, 24
-    srl       s1, s1, 24
-    srl       t2, t2, 24
-    srl       t8, t8, 24
-    srl       s3, s3, 24
-    srl       v0, v0, 24
-    lw        t6, 0(a1)
-    addiu     t4, t4, 0x80
-    addiu     t7, t7, 0x80
-    addiu     t1, t1, 0x80
-    addiu     t3, t3, 0x80
-    addiu     t9, t9, 0x80
-    addiu     t0, t0, 0x80
-    addiu     t5, t5, 0x80
-    addiu     s1, s1, 0x80
-    addiu     t2, t2, 0x80
-    addiu     t8, t8, 0x80
-    addiu     s3, s3, 0x80
-    addiu     v0, v0, 0x80
-    sb        t4, 0(t6)
-    sb        t7, 1(t6)
-    sb        t1, 2(t6)
-    sb        t3, 3(t6)
-    sb        t9, 4(t6)
-    sb        t0, 5(t6)
-    sb        t5, 6(t6)
-    sb        s1, 7(t6)
-    sb        t2, 8(t6)
-    sb        t8, 9(t6)
-    sb        s3, 10(t6)
-    sb        v0, 11(t6)
-    bgtz      a3, 1b
-     addiu    a1, a1, 4
-
-    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
-
-    jr        ra
-     nop
-
-END(jsimd_idct_12x12_pass2_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
-/*
- * a0     - sample_data
- * a1     - start_col
- * a2     - workspace
- */
-
-    lw             t0, 0(a0)
-    li             t7, 0xff80ff80
-    addu           t0, t0, a1
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    lw             t0, 4(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 0(a2)
-    usw            t4, 4(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 8(a2)
-    usw            t6, 12(a2)
-
-    lw             t0, 8(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 16(a2)
-    usw            t4, 20(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 24(a2)
-    usw            t6, 28(a2)
-
-    lw             t0, 12(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 32(a2)
-    usw            t4, 36(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 40(a2)
-    usw            t6, 44(a2)
-
-    lw             t0, 16(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 48(a2)
-    usw            t4, 52(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 56(a2)
-    usw            t6, 60(a2)
-
-    lw             t0, 20(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 64(a2)
-    usw            t4, 68(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 72(a2)
-    usw            t6, 76(a2)
-
-    lw             t0, 24(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 80(a2)
-    usw            t4, 84(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 88(a2)
-    usw            t6, 92(a2)
-
-    lw             t0, 28(a0)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu           t0, t0, a1
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    ulw            t1, 0(t0)
-    ulw            t2, 4(t0)
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 96(a2)
-    usw            t4, 100(a2)
-    preceu.ph.qbr  t3, t1
-    preceu.ph.qbl  t4, t1
-    usw            t5, 104(a2)
-    usw            t6, 108(a2)
-    preceu.ph.qbr  t5, t2
-    preceu.ph.qbl  t6, t2
-    addu.ph        t3, t3, t7
-    addu.ph        t4, t4, t7
-    addu.ph        t5, t5, t7
-    addu.ph        t6, t6, t7
-    usw            t3, 112(a2)
-    usw            t4, 116(a2)
-    usw            t5, 120(a2)
-    usw            t6, 124(a2)
-
-    j              ra
-     nop
-
-END(jsimd_convsamp_mips_dspr2)
-
-/*****************************************************************************/
-LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
-/*
- * a0     - sample_data
- * a1     - start_col
- * a2     - workspace
- */
-
-    .set at
-
-    lw       t0, 0(a0)
-    addu     t0, t0, a1
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 4(a0)
-    swc1     f2, 0(a2)
-    swc1     f4, 4(a2)
-    swc1     f6, 8(a2)
-    addu     t0, t0, a1
-    swc1     f8, 12(a2)
-    swc1     f10, 16(a2)
-    swc1     f12, 20(a2)
-    swc1     f14, 24(a2)
-    swc1     f16, 28(a2)
-    //elemr 1
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 8(a0)
-    swc1     f2, 32(a2)
-    swc1     f4, 36(a2)
-    swc1     f6, 40(a2)
-    addu     t0, t0, a1
-    swc1     f8, 44(a2)
-    swc1     f10, 48(a2)
-    swc1     f12, 52(a2)
-    swc1     f14, 56(a2)
-    swc1     f16, 60(a2)
-    //elemr 2
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 12(a0)
-    swc1     f2, 64(a2)
-    swc1     f4, 68(a2)
-    swc1     f6, 72(a2)
-    addu     t0, t0, a1
-    swc1     f8, 76(a2)
-    swc1     f10, 80(a2)
-    swc1     f12, 84(a2)
-    swc1     f14, 88(a2)
-    swc1     f16, 92(a2)
-    //elemr 3
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 16(a0)
-    swc1     f2, 96(a2)
-    swc1     f4, 100(a2)
-    swc1     f6, 104(a2)
-    addu     t0, t0, a1
-    swc1     f8, 108(a2)
-    swc1     f10, 112(a2)
-    swc1     f12, 116(a2)
-    swc1     f14, 120(a2)
-    swc1     f16, 124(a2)
-    //elemr 4
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 20(a0)
-    swc1     f2, 128(a2)
-    swc1     f4, 132(a2)
-    swc1     f6, 136(a2)
-    addu     t0, t0, a1
-    swc1     f8, 140(a2)
-    swc1     f10, 144(a2)
-    swc1     f12, 148(a2)
-    swc1     f14, 152(a2)
-    swc1     f16, 156(a2)
-    //elemr 5
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 24(a0)
-    swc1     f2, 160(a2)
-    swc1     f4, 164(a2)
-    swc1     f6, 168(a2)
-    addu     t0, t0, a1
-    swc1     f8, 172(a2)
-    swc1     f10, 176(a2)
-    swc1     f12, 180(a2)
-    swc1     f14, 184(a2)
-    swc1     f16, 188(a2)
-    //elemr 6
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    lw       t0, 28(a0)
-    swc1     f2, 192(a2)
-    swc1     f4, 196(a2)
-    swc1     f6, 200(a2)
-    addu     t0, t0, a1
-    swc1     f8, 204(a2)
-    swc1     f10, 208(a2)
-    swc1     f12, 212(a2)
-    swc1     f14, 216(a2)
-    swc1     f16, 220(a2)
-    //elemr 7
-    lbu      t1, 0(t0)
-    lbu      t2, 1(t0)
-    lbu      t3, 2(t0)
-    lbu      t4, 3(t0)
-    lbu      t5, 4(t0)
-    lbu      t6, 5(t0)
-    lbu      t7, 6(t0)
-    lbu      t8, 7(t0)
-    addiu    t1, t1, -128
-    addiu    t2, t2, -128
-    addiu    t3, t3, -128
-    addiu    t4, t4, -128
-    addiu    t5, t5, -128
-    addiu    t6, t6, -128
-    addiu    t7, t7, -128
-    addiu    t8, t8, -128
-    mtc1     t1, f2
-    mtc1     t2, f4
-    mtc1     t3, f6
-    mtc1     t4, f8
-    mtc1     t5, f10
-    mtc1     t6, f12
-    mtc1     t7, f14
-    mtc1     t8, f16
-    cvt.s.w  f2, f2
-    cvt.s.w  f4, f4
-    cvt.s.w  f6, f6
-    cvt.s.w  f8, f8
-    cvt.s.w  f10, f10
-    cvt.s.w  f12, f12
-    cvt.s.w  f14, f14
-    cvt.s.w  f16, f16
-    swc1     f2, 224(a2)
-    swc1     f4, 228(a2)
-    swc1     f6, 232(a2)
-    swc1     f8, 236(a2)
-    swc1     f10, 240(a2)
-    swc1     f12, 244(a2)
-    swc1     f14, 248(a2)
-    swc1     f16, 252(a2)
-
-    j        ra
-     nop
-
-END(jsimd_convsamp_float_mips_dspr2)
-
-/*****************************************************************************/
-
diff --git a/media/libjpeg/simd/jsimd_mips_dspr2_asm.h b/media/libjpeg/simd/jsimd_mips_dspr2_asm.h
deleted file mode 100644
index 64f9880482..0000000000
--- a/media/libjpeg/simd/jsimd_mips_dspr2_asm.h
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * MIPS DSPr2 optimizations for libjpeg-turbo
- *
- * Copyright (C) 2013, MIPS Technologies, Inc., California.
- * All Rights Reserved.
- * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
- *           Darko Laus       (darko.laus@imgtec.com)
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#define zero $0
-#define AT   $1
-#define v0   $2
-#define v1   $3
-#define a0   $4
-#define a1   $5
-#define a2   $6
-#define a3   $7
-#define t0   $8
-#define t1   $9
-#define t2   $10
-#define t3   $11
-#define t4   $12
-#define t5   $13
-#define t6   $14
-#define t7   $15
-#define s0   $16
-#define s1   $17
-#define s2   $18
-#define s3   $19
-#define s4   $20
-#define s5   $21
-#define s6   $22
-#define s7   $23
-#define t8   $24
-#define t9   $25
-#define k0   $26
-#define k1   $27
-#define gp   $28
-#define sp   $29
-#define fp   $30
-#define s8   $30
-#define ra   $31
-
-#define f0   $f0
-#define f1   $f1
-#define f2   $f2
-#define f3   $f3
-#define f4   $f4
-#define f5   $f5
-#define f6   $f6
-#define f7   $f7
-#define f8   $f8
-#define f9   $f9
-#define f10  $f10
-#define f11  $f11
-#define f12  $f12
-#define f13  $f13
-#define f14  $f14
-#define f15  $f15
-#define f16  $f16
-#define f17  $f17
-#define f18  $f18
-#define f19  $f19
-#define f20  $f20
-#define f21  $f21
-#define f22  $f22
-#define f23  $f23
-#define f24  $f24
-#define f25  $f25
-#define f26  $f26
-#define f27  $f27
-#define f28  $f28
-#define f29  $f29
-#define f30  $f30
-#define f31  $f31
-
-/*
- * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
- */
-#define LEAF_MIPS32R2(symbol)                           \
-                .globl  symbol;                         \
-                .align  2;                              \
-                .type   symbol, @function;              \
-                .ent    symbol, 0;                      \
-symbol:         .frame  sp, 0, ra;                      \
-                .set    push;                           \
-                .set    arch=mips32r2;                  \
-                .set    noreorder;                      \
-                .set    noat;
-
-/*
- * LEAF_MIPS_DSPR2 - declare leaf routine for MIPS DSPr2
- */
-#define LEAF_MIPS_DSPR2(symbol)                         \
-LEAF_MIPS32R2(symbol)                                   \
-                .set    dspr2;
-
-/*
- * END - mark end of function
- */
-#define END(function)                                   \
-                .set    pop;                            \
-                .end    function;                       \
-                .size   function,.-function
-
-/*
- * Checks if stack offset is big enough for storing/restoring regs_num
- * number of register to/from stack. Stack offset must be greater than
- * or equal to the number of bytes needed for storing registers (regs_num*4).
- * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
- * preserved for input arguments of the functions, already stored in a0-a3),
- * stack size can be further optimized by utilizing this space.
- */
-.macro CHECK_STACK_OFFSET regs_num, stack_offset
-.if \stack_offset < \regs_num * 4 - 16
-.error "Stack offset too small."
-.endif
-.endm
-
-/*
- * Saves set of registers on stack. Maximum number of registers that
- * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
- * Stack offset is number of bytes that are added to stack pointer (sp)
- * before registers are pushed in order to provide enough space on stack
- * (offset must be multiple of 4, and must be big enough, as described by
- * CHECK_STACK_OFFSET macro). This macro is intended to be used in
- * combination with RESTORE_REGS_FROM_STACK macro. Example:
- *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
- *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
-                          r2  = 0, r3  = 0, r4  = 0, \
-                          r5  = 0, r6  = 0, r7  = 0, \
-                          r8  = 0, r9  = 0, r10 = 0, \
-                          r11 = 0, r12 = 0, r13 = 0, \
-                          r14 = 0
-    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
-    .error "Stack offset must be pozitive and multiple of 4."
-    .endif
-    .if \stack_offset != 0
-    addiu           sp, sp, -\stack_offset
-    .endif
-    sw              \r1, 0(sp)
-    .if \r2 != 0
-    sw              \r2, 4(sp)
-    .endif
-    .if \r3 != 0
-    sw              \r3, 8(sp)
-    .endif
-    .if \r4 != 0
-    sw              \r4, 12(sp)
-    .endif
-    .if \r5 != 0
-    CHECK_STACK_OFFSET 5, \stack_offset
-    sw              \r5, 16(sp)
-    .endif
-    .if \r6 != 0
-    CHECK_STACK_OFFSET 6, \stack_offset
-    sw              \r6, 20(sp)
-    .endif
-    .if \r7 != 0
-    CHECK_STACK_OFFSET 7, \stack_offset
-    sw              \r7, 24(sp)
-    .endif
-    .if \r8 != 0
-    CHECK_STACK_OFFSET 8, \stack_offset
-    sw              \r8, 28(sp)
-    .endif
-    .if \r9 != 0
-    CHECK_STACK_OFFSET 9, \stack_offset
-    sw              \r9, 32(sp)
-    .endif
-    .if \r10 != 0
-    CHECK_STACK_OFFSET 10, \stack_offset
-    sw              \r10, 36(sp)
-    .endif
-    .if \r11 != 0
-    CHECK_STACK_OFFSET 11, \stack_offset
-    sw              \r11, 40(sp)
-    .endif
-    .if \r12 != 0
-    CHECK_STACK_OFFSET 12, \stack_offset
-    sw              \r12, 44(sp)
-    .endif
-    .if \r13 != 0
-    CHECK_STACK_OFFSET 13, \stack_offset
-    sw              \r13, 48(sp)
-    .endif
-    .if \r14 != 0
-    CHECK_STACK_OFFSET 14, \stack_offset
-    sw              \r14, 52(sp)
-    .endif
-.endm
-
-/*
- * Restores set of registers from stack. Maximum number of registers that
- * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
- * Stack offset is number of bytes that are added to stack pointer (sp)
- * after registers are restored (offset must be multiple of 4, and must
- * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
- * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
- * Example:
- *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
- *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
- */
-.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
-                               r2  = 0, r3  = 0, r4  = 0, \
-                               r5  = 0, r6  = 0, r7  = 0, \
-                               r8  = 0, r9  = 0, r10 = 0, \
-                               r11 = 0, r12 = 0, r13 = 0, \
-                               r14 = 0
-    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
-    .error "Stack offset must be pozitive and multiple of 4."
-    .endif
-    lw              \r1, 0(sp)
-    .if \r2 != 0
-    lw              \r2, 4(sp)
-    .endif
-    .if \r3 != 0
-    lw              \r3, 8(sp)
-    .endif
-    .if \r4 != 0
-    lw              \r4, 12(sp)
-    .endif
-    .if \r5 != 0
-    CHECK_STACK_OFFSET 5, \stack_offset
-    lw              \r5, 16(sp)
-    .endif
-    .if \r6 != 0
-    CHECK_STACK_OFFSET 6, \stack_offset
-    lw              \r6, 20(sp)
-    .endif
-    .if \r7 != 0
-    CHECK_STACK_OFFSET 7, \stack_offset
-    lw              \r7, 24(sp)
-    .endif
-    .if \r8 != 0
-    CHECK_STACK_OFFSET 8, \stack_offset
-    lw              \r8, 28(sp)
-    .endif
-    .if \r9 != 0
-    CHECK_STACK_OFFSET 9, \stack_offset
-    lw              \r9, 32(sp)
-    .endif
-    .if \r10 != 0
-    CHECK_STACK_OFFSET 10, \stack_offset
-    lw              \r10, 36(sp)
-    .endif
-    .if \r11 != 0
-    CHECK_STACK_OFFSET 11, \stack_offset
-    lw              \r11, 40(sp)
-    .endif
-    .if \r12 != 0
-    CHECK_STACK_OFFSET 12, \stack_offset
-    lw              \r12, 44(sp)
-    .endif
-    .if \r13 != 0
-    CHECK_STACK_OFFSET 13, \stack_offset
-    lw              \r13, 48(sp)
-    .endif
-    .if \r14 != 0
-    CHECK_STACK_OFFSET 14, \stack_offset
-    lw              \r14, 52(sp)
-    .endif
-    .if \stack_offset != 0
-    addiu           sp, sp, \stack_offset
-    .endif
-.endm
-
-
diff --git a/media/libjpeg/simd/jsimd_powerpc.c b/media/libjpeg/simd/jsimd_powerpc.c
deleted file mode 100644
index 42dc1e0868..0000000000
--- a/media/libjpeg/simd/jsimd_powerpc.c
+++ /dev/null
@@ -1,828 +0,0 @@
-/*
- * jsimd_powerpc.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * PowerPC architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-#include <stdio.h>
-#include <string.h>
-#include <ctype.h>
-
-static unsigned int simd_support = ~0;
-
-#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-
-#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024)
-
-LOCAL(int)
-check_feature (char *buffer, char *feature)
-{
-  char *p;
-  if (*feature == 0)
-    return 0;
-  if (strncmp(buffer, "cpu", 3) != 0)
-    return 0;
-  buffer += 3;
-  while (isspace(*buffer))
-    buffer++;
-
-  /* Check if 'feature' is present in the buffer as a separate word */
-  while ((p = strstr(buffer, feature))) {
-    if (p > buffer && !isspace(*(p - 1))) {
-      buffer++;
-      continue;
-    }
-    p += strlen(feature);
-    if (*p != 0 && !isspace(*p)) {
-      buffer++;
-      continue;
-    }
-    return 1;
-  }
-  return 0;
-}
-
-LOCAL(int)
-parse_proc_cpuinfo (int bufsize)
-{
-  char *buffer = (char *)malloc(bufsize);
-  FILE *fd;
-  simd_support = 0;
-
-  if (!buffer)
-    return 0;
-
-  fd = fopen("/proc/cpuinfo", "r");
-  if (fd) {
-    while (fgets(buffer, bufsize, fd)) {
-      if (!strchr(buffer, '\n') && !feof(fd)) {
-        /* "impossible" happened - insufficient size of the buffer! */
-        fclose(fd);
-        free(buffer);
-        return 0;
-      }
-      if (check_feature(buffer, "altivec"))
-        simd_support |= JSIMD_ALTIVEC;
-    }
-    fclose(fd);
-  }
-  free(buffer);
-  return 1;
-}
-
-#endif
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
-  int bufsize = 1024; /* an initial guess for the line buffer size limit */
-#endif
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = 0;
-
-#if defined(__ALTIVEC__) || defined(__APPLE__)
-  simd_support |= JSIMD_ALTIVEC;
-#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
-  while (!parse_proc_cpuinfo(bufsize)) {
-    bufsize *= 2;
-    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
-      break;
-  }
-#endif
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCEALTIVEC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = JSIMD_ALTIVEC;
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_extrgb_ycc_convert_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_extrgbx_ycc_convert_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_extbgr_ycc_convert_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_extbgrx_ycc_convert_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_extxbgr_ycc_convert_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_extxrgb_ycc_convert_altivec;
-      break;
-    default:
-      altivecfct=jsimd_rgb_ycc_convert_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_extrgb_gray_convert_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_extrgbx_gray_convert_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_extbgr_gray_convert_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_extbgrx_gray_convert_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_extxbgr_gray_convert_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_extxrgb_gray_convert_altivec;
-      break;
-    default:
-      altivecfct=jsimd_rgb_gray_convert_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_ycc_extrgb_convert_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_ycc_extrgbx_convert_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_ycc_extbgr_convert_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_ycc_extbgrx_convert_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_ycc_extxbgr_convert_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_ycc_extxrgb_convert_altivec;
-      break;
-    default:
-      altivecfct=jsimd_ycc_rgb_convert_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
-                                compptr->v_samp_factor,
-                                compptr->width_in_blocks,
-                                input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
-                                compptr->v_samp_factor,
-                                compptr->width_in_blocks,
-                                input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
-                              input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
-                              input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
-                                    compptr->downsampled_width, input_data,
-                                    output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
-                                    compptr->downsampled_width, input_data,
-                                    output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_h2v2_extrgb_merged_upsample_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_h2v2_extrgbx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_h2v2_extbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_h2v2_extbgrx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_h2v2_extxbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_h2v2_extxrgb_merged_upsample_altivec;
-      break;
-    default:
-      altivecfct=jsimd_h2v2_merged_upsample_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*altivecfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      altivecfct=jsimd_h2v1_extrgb_merged_upsample_altivec;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      altivecfct=jsimd_h2v1_extrgbx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGR:
-      altivecfct=jsimd_h2v1_extbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      altivecfct=jsimd_h2v1_extbgrx_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      altivecfct=jsimd_h2v1_extxbgr_merged_upsample_altivec;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      altivecfct=jsimd_h2v1_extxrgb_merged_upsample_altivec;
-      break;
-    default:
-      altivecfct=jsimd_h2v1_merged_upsample_altivec;
-      break;
-  }
-
-  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_altivec(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  jsimd_fdct_islow_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_altivec(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_altivec(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_ALTIVEC)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
-                           output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
-                           output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return NULL;
-}
diff --git a/media/libjpeg/simd/jsimd_x86_64.c b/media/libjpeg/simd/jsimd_x86_64.c
deleted file mode 100644
index a62bcdb0ee..0000000000
--- a/media/libjpeg/simd/jsimd_x86_64.c
+++ /dev/null
@@ -1,887 +0,0 @@
-/*
- * jsimd_x86_64.c
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2009-2011, 2014, 2016, D. R. Commander.
- * Copyright (C) 2015, Matthieu Darbois.
- *
- * Based on the x86 SIMD extension for IJG JPEG library,
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- * For conditions of distribution and use, see copyright notice in jsimdext.inc
- *
- * This file contains the interface between the "normal" portions
- * of the library and the SIMD implementations when running on a
- * 64-bit x86 architecture.
- */
-
-#define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
-#include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
-
-/*
- * In the PIC cases, we have no guarantee that constants will keep
- * their alignment. This macro allows us to verify it at runtime.
- */
-#define IS_ALIGNED(ptr, order) (((size_t)ptr & ((1 << order) - 1)) == 0)
-
-#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
-
-static unsigned int simd_support = ~0;
-static unsigned int simd_huffman = 1;
-
-/*
- * Check what SIMD accelerations are supported.
- *
- * FIXME: This code is racy under a multi-threaded environment.
- */
-LOCAL(void)
-init_simd (void)
-{
-  char *env = NULL;
-
-  if (simd_support != ~0U)
-    return;
-
-  simd_support = JSIMD_SSE2 | JSIMD_SSE;
-
-  /* Force different settings through environment variables */
-  env = getenv("JSIMD_FORCENONE");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_support = 0;
-  env = getenv("JSIMD_NOHUFFENC");
-  if ((env != NULL) && (strcmp(env, "1") == 0))
-    simd_huffman = 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_ycc (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_rgb_gray (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_ycc_rgb565 (void)
-{
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
-                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                       JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_ycc_convert_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_ycc_convert_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
-      break;
-    default:
-      sse2fct=jsimd_rgb_ycc_convert_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_rgb_gray_convert (j_compress_ptr cinfo,
-                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
-                        JDIMENSION output_row, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
-
-  switch(cinfo->in_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_extrgb_gray_convert_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_extrgbx_gray_convert_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_extbgr_gray_convert_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_extbgrx_gray_convert_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_extxbgr_gray_convert_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_extxrgb_gray_convert_sse2;
-      break;
-    default:
-      sse2fct=jsimd_rgb_gray_convert_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
-                       JSAMPIMAGE input_buf, JDIMENSION input_row,
-                       JSAMPARRAY output_buf, int num_rows)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_ycc_extrgb_convert_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_ycc_extbgr_convert_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
-      break;
-    default:
-      sse2fct=jsimd_ycc_rgb_convert_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
-}
-
-GLOBAL(void)
-jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
-                          JSAMPIMAGE input_buf, JDIMENSION input_row,
-                          JSAMPARRAY output_buf, int num_rows)
-{
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_downsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(void)
-jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
-                       JSAMPARRAY input_data, JSAMPARRAY output_data)
-{
-  jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
-                             compptr->v_samp_factor, compptr->width_in_blocks,
-                             input_data, output_data);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                           input_data, output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_upsample (j_decompress_ptr cinfo,
-                     jpeg_component_info *compptr,
-                     JSAMPARRAY input_data,
-                     JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
-                           input_data, output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_fancy_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width, input_data,
-                                 output_data_ptr);
-}
-
-GLOBAL(void)
-jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
-                           jpeg_component_info *compptr,
-                           JSAMPARRAY input_data,
-                           JSAMPARRAY *output_data_ptr)
-{
-  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
-                                 compptr->downsampled_width, input_data,
-                                 output_data_ptr);
-}
-
-GLOBAL(int)
-jsimd_can_h2v2_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_h2v1_merged_upsample (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) &&
-      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
-      break;
-    default:
-      sse2fct=jsimd_h2v2_merged_upsample_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(void)
-jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
-                            JSAMPIMAGE input_buf,
-                            JDIMENSION in_row_group_ctr,
-                            JSAMPARRAY output_buf)
-{
-  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
-
-  switch(cinfo->out_color_space) {
-    case JCS_EXT_RGB:
-      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
-      break;
-    case JCS_EXT_RGBX:
-    case JCS_EXT_RGBA:
-      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGR:
-      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_BGRX:
-    case JCS_EXT_BGRA:
-      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XBGR:
-    case JCS_EXT_ABGR:
-      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
-      break;
-    case JCS_EXT_XRGB:
-    case JCS_EXT_ARGB:
-      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
-      break;
-    default:
-      sse2fct=jsimd_h2v1_merged_upsample_sse2;
-      break;
-  }
-
-  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
-}
-
-GLOBAL(int)
-jsimd_can_convsamp (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_convsamp_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
-                DCTELEM *workspace)
-{
-  jsimd_convsamp_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(void)
-jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
-                      FAST_FLOAT *workspace)
-{
-  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_fdct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_fdct_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_fdct_islow (DCTELEM *data)
-{
-  jsimd_fdct_islow_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_ifast (DCTELEM *data)
-{
-  jsimd_fdct_ifast_sse2(data);
-}
-
-GLOBAL(void)
-jsimd_fdct_float (FAST_FLOAT *data)
-{
-  jsimd_fdct_float_sse(data);
-}
-
-GLOBAL(int)
-jsimd_can_quantize (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(DCTELEM) != 2)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_quantize_float (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-
-  if (simd_support & JSIMD_SSE2)
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
-                DCTELEM *workspace)
-{
-  jsimd_quantize_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(void)
-jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
-                      FAST_FLOAT *workspace)
-{
-  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
-}
-
-GLOBAL(int)
-jsimd_can_idct_2x2 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_4x4 (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                JDIMENSION output_col)
-{
-  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
-}
-
-GLOBAL(int)
-jsimd_can_idct_islow (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(ISLOW_MULT_TYPE) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_ifast (void)
-{
-  init_simd();
-
-  /* The code is optimised for these values only */
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(IFAST_MULT_TYPE) != 2)
-    return 0;
-  if (IFAST_SCALE_BITS != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(int)
-jsimd_can_idct_float (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-  if (BITS_IN_JSAMPLE != 8)
-    return 0;
-  if (sizeof(JDIMENSION) != 4)
-    return 0;
-  if (sizeof(FAST_FLOAT) != 4)
-    return 0;
-  if (sizeof(FLOAT_MULT_TYPE) != 4)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(void)
-jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(void)
-jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
-                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
-                  JDIMENSION output_col)
-{
-  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
-                        output_col);
-}
-
-GLOBAL(int)
-jsimd_can_huff_encode_one_block (void)
-{
-  init_simd();
-
-  if (DCTSIZE != 8)
-    return 0;
-  if (sizeof(JCOEF) != 2)
-    return 0;
-
-  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
-      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
-    return 1;
-
-  return 0;
-}
-
-GLOBAL(JOCTET*)
-jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
-                             int last_dc_val, c_derived_tbl *dctbl,
-                             c_derived_tbl *actbl)
-{
-  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
-                                          dctbl, actbl);
-}
diff --git a/media/libjpeg/simd/jsimdcpu.asm b/media/libjpeg/simd/jsimdcpu.asm
deleted file mode 100644
index 599083b182..0000000000
--- a/media/libjpeg/simd/jsimdcpu.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; jsimdcpu.asm - SIMD instruction support check
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-;
-; Based on the x86 SIMD extension for IJG JPEG library
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; This file should be assembled with NASM (Netwide Assembler),
-; can *not* be assembled with Microsoft's MASM or any compatible
-; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ or
-; http://sourceforge.net/project/showfiles.php?group_id=6208
-;
-; [TAB8]
-
-%include "jsimdext.inc"
-
-; --------------------------------------------------------------------------
-        SECTION SEG_TEXT
-        BITS    32
-;
-; Check if the CPU supports SIMD instructions
-;
-; GLOBAL(unsigned int)
-; jpeg_simd_cpu_support (void)
-;
-
-        align   16
-        global  EXTN(jpeg_simd_cpu_support)
-
-EXTN(jpeg_simd_cpu_support):
-        push    ebx
-;       push    ecx             ; need not be preserved
-;       push    edx             ; need not be preserved
-;       push    esi             ; unused
-        push    edi
-
-        xor     edi,edi                 ; simd support flag
-
-        pushfd
-        pop     eax
-        mov     edx,eax
-        xor     eax, 1<<21              ; flip ID bit in EFLAGS
-        push    eax
-        popfd
-        pushfd
-        pop     eax
-        xor     eax,edx
-        jz      short .return           ; CPUID is not supported
-
-        ; Check for MMX instruction support
-        xor     eax,eax
-        cpuid
-        test    eax,eax
-        jz      short .return
-
-        xor     eax,eax
-        inc     eax
-        cpuid
-        mov     eax,edx                 ; eax = Standard feature flags
-
-        test    eax, 1<<23              ; bit23:MMX
-        jz      short .no_mmx
-        or      edi, byte JSIMD_MMX
-.no_mmx:
-        test    eax, 1<<25              ; bit25:SSE
-        jz      short .no_sse
-        or      edi, byte JSIMD_SSE
-.no_sse:
-        test    eax, 1<<26              ; bit26:SSE2
-        jz      short .no_sse2
-        or      edi, byte JSIMD_SSE2
-.no_sse2:
-
-        ; Check for 3DNow! instruction support
-        mov     eax, 0x80000000
-        cpuid
-        cmp     eax, 0x80000000
-        jbe     short .return
-
-        mov     eax, 0x80000001
-        cpuid
-        mov     eax,edx                 ; eax = Extended feature flags
-
-        test    eax, 1<<31              ; bit31:3DNow!(vendor independent)
-        jz      short .no_3dnow
-        or      edi, byte JSIMD_3DNOW
-.no_3dnow:
-
-.return:
-        mov     eax,edi
-
-        pop     edi
-;       pop     esi             ; unused
-;       pop     edx             ; need not be preserved
-;       pop     ecx             ; need not be preserved
-        pop     ebx
-        ret
-
-; For some reason, the OS X linker does not honor the request to align the
-; segment unless we do this.
-        align   16
diff --git a/media/libjpeg/simd/jsimdext.inc b/media/libjpeg/simd/jsimdext.inc
deleted file mode 100644
index f28db60b57..0000000000
--- a/media/libjpeg/simd/jsimdext.inc
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; jsimdext.inc - common declarations
-;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, D. R. Commander.
-;
-; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
-;
-; Copyright (C) 1999-2006, MIYASAKA Masaru.
-;
-; This software is provided 'as-is', without any express or implied
-; warranty.  In no event will the authors be held liable for any damages
-; arising from the use of this software.
-;
-; Permission is granted to anyone to use this software for any purpose,
-; including commercial applications, and to alter it and redistribute it
-; freely, subject to the following restrictions:
-;
-; 1. The origin of this software must not be misrepresented; you must not
-;    claim that you wrote the original software. If you use this software
-;    in a product, an acknowledgment in the product documentation would be
-;    appreciated but is not required.
-; 2. Altered source versions must be plainly marked as such, and must not be
-;    misrepresented as being the original software.
-; 3. This notice may not be removed or altered from any source distribution.
-;
-; [TAB8]
-
-; ==========================================================================
-;  System-dependent configurations
-
-%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
-; * Microsoft Visual C++
-; * MinGW (Minimalist GNU for Windows)
-; * CygWin
-; * LCC-Win32
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT    .text  align=16
-%define SEG_CONST   .rdata align=16
-%else
-%define SEG_TEXT    .text  align=16 public use32 class=CODE
-%define SEG_CONST   .rdata align=16 public use32 class=CONST
-%endif
-
-%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
-; * Microsoft Visual C++
-
-; -- segment definition --
-;
-%ifdef __YASM_VER__
-%define SEG_TEXT    .text  align=16
-%define SEG_CONST   .rdata align=16
-%else
-%define SEG_TEXT    .text  align=16 public use64 class=CODE
-%define SEG_CONST   .rdata align=16 public use64 class=CONST
-%endif
-%define EXTN(name)  name                        ; foo() -> foo
-
-%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
-; * Borland C++ (Win32)
-
-; -- segment definition --
-;
-%define SEG_TEXT    _text  align=16 public use32 class=CODE
-%define SEG_CONST   _data  align=16 public use32 class=DATA
-
-%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
-; * Linux
-; * *BSD family Unix using elf format
-; * Unix System V, including Solaris x86, UnixWare and SCO Unix
-
-; mark stack as non-executable
-section .note.GNU-stack noalloc noexec nowrite progbits
-
-; -- segment definition --
-;
-%ifdef __x86_64__
-%define SEG_TEXT    .text   progbits align=16
-%define SEG_CONST   .rodata progbits align=16
-%else
-%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
-%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
-%endif
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_       ; ELF supports PIC
-%define EXTN(name)  name                        ; foo() -> foo
-
-%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
-; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
-; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
-
-; -- segment definition --
-;
-%define SEG_TEXT    .text
-%define SEG_CONST   .data
-
-; To make the code position-independent, append -DPIC to the commandline
-;
-%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_      ; BSD-style a.out supports PIC
-
-%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
-; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
-
-; -- segment definition --
-;
-%define SEG_TEXT    .text  ;align=16    ; nasm doesn't accept align=16. why?
-%define SEG_CONST   .rodata align=16
-
-; The generation of position-independent code (PIC) is the default on Darwin.
-;
-%define PIC
-%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
-
-%else           ; ----(Other case)----------------------
-
-; -- segment definition --
-;
-%define SEG_TEXT    .text
-%define SEG_CONST   .data
-
-%endif  ; ----------------------------------------------
-
-; ==========================================================================
-
-; --------------------------------------------------------------------------
-;  Common types
-;
-%ifdef __x86_64__
-%define POINTER                 qword           ; general pointer type
-%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
-%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
-%else
-%define POINTER                 dword           ; general pointer type
-%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
-%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
-%endif
-
-%define INT                     dword           ; signed integer type
-%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
-%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT
-
-%define FP32                    dword           ; IEEE754 single
-%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
-%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
-
-%define MMWORD                  qword           ; int64  (MMX register)
-%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
-%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
-
-; NASM is buggy and doesn't properly handle operand sizes for SSE
-; instructions, so for now we have to define XMMWORD as blank.
-%define XMMWORD                                 ; int128 (SSE register)
-%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
-%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
-
-; Similar hacks for when we load a dword or MMWORD into an xmm# register
-%define XMM_DWORD
-%define XMM_MMWORD
-
-%define SIZEOF_BYTE             1               ; sizeof(BYTE)
-%define SIZEOF_WORD             2               ; sizeof(WORD)
-%define SIZEOF_DWORD            4               ; sizeof(DWORD)
-%define SIZEOF_QWORD            8               ; sizeof(QWORD)
-%define SIZEOF_OWORD            16              ; sizeof(OWORD)
-
-%define BYTE_BIT                8               ; CHAR_BIT in C
-%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
-%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
-%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
-%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT
-
-; --------------------------------------------------------------------------
-;  External Symbol Name
-;
-%ifndef EXTN
-%define EXTN(name)   _ %+ name          ; foo() -> _foo
-%endif
-
-; --------------------------------------------------------------------------
-;  Macros for position-independent code (PIC) support
-;
-%ifndef GOT_SYMBOL
-%undef PIC
-%endif
-
-%ifdef PIC ; -------------------------------------------
-
-%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
-
-; At present, nasm doesn't seem to support PIC generation for Mach-O.
-; The PIC support code below is a little tricky.
-
-        SECTION SEG_CONST
-const_base:
-
-%define GOTOFF(got,sym) (got) + (sym) - const_base
-
-%imacro get_GOT 1
-        ; NOTE: this macro destroys ecx resister.
-        call    %%geteip
-        add     ecx, byte (%%ref - $)
-        jmp     short %%adjust
-%%geteip:
-        mov     ecx, POINTER [esp]
-        ret
-%%adjust:
-        push    ebp
-        xor     ebp,ebp         ; ebp = 0
-%ifidni %1,ebx  ; (%1 == ebx)
-        ; db 0x8D,0x9C + jmp near const_base =
-        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
-        db      0x8D,0x9C               ; 8D,9C
-        jmp     near const_base         ; E9,(const_base-%%ref)
-%%ref:
-%else  ; (%1 != ebx)
-        ; db 0x8D,0x8C + jmp near const_base =
-        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
-        db      0x8D,0x8C               ; 8D,8C
-        jmp     near const_base         ; E9,(const_base-%%ref)
-%%ref:  mov     %1, ecx
-%endif ; (%1 == ebx)
-        pop     ebp
-%endmacro
-
-%else   ; GOT_SYMBOL != _MACHO_PIC_ ----------------
-
-%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
-
-%imacro get_GOT 1
-        extern  GOT_SYMBOL
-        call    %%geteip
-        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
-        jmp     short %%done
-%%geteip:
-        mov     %1, POINTER [esp]
-        ret
-%%done:
-%endmacro
-
-%endif  ; GOT_SYMBOL == _MACHO_PIC_ ----------------
-
-%imacro pushpic 1.nolist
-        push    %1
-%endmacro
-%imacro poppic  1.nolist
-        pop     %1
-%endmacro
-%imacro movpic  2.nolist
-        mov     %1,%2
-%endmacro
-
-%else   ; !PIC -----------------------------------------
-
-%define GOTOFF(got,sym) (sym)
-
-%imacro get_GOT 1.nolist
-%endmacro
-%imacro pushpic 1.nolist
-%endmacro
-%imacro poppic  1.nolist
-%endmacro
-%imacro movpic  2.nolist
-%endmacro
-
-%endif  ;  PIC -----------------------------------------
-
-; --------------------------------------------------------------------------
-;  Align the next instruction on {2,4,8,16,..}-byte boundary.
-;  ".balign n,,m" in GNU as
-;
-%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
-%define FILLB(b,n)  (($$-(b)) & ((n)-1))
-
-%imacro alignx 1-2.nolist 0xFFFF
-%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
-               db 0x90                               ; nop
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
-               db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
-               db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
-               db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
-               db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
-               db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
-               db 0x8B,0xED                          ; mov ebp,ebp
-        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
-               db 0x90                               ; nop
-%endmacro
-
-; Align the next data on {2,4,8,16,..}-byte boundary.
-;
-%imacro alignz 1.nolist
-        align %1, db 0          ; filling zeros
-%endmacro
-
-%ifdef __x86_64__
-
-%ifdef WIN64
-
-%imacro collect_args 0
-        push r12
-        push r13
-        push r14
-        push r15
-        mov r10, rcx
-        mov r11, rdx
-        mov r12, r8
-        mov r13, r9
-        mov r14, [rax+48]
-        mov r15, [rax+56]
-        push rsi
-        push rdi
-        sub     rsp, SIZEOF_XMMWORD
-        movaps  XMMWORD [rsp], xmm6
-        sub     rsp, SIZEOF_XMMWORD
-        movaps  XMMWORD [rsp], xmm7
-%endmacro
-
-%imacro uncollect_args 0
-        movaps  xmm7, XMMWORD [rsp]
-        add     rsp, SIZEOF_XMMWORD
-        movaps  xmm6, XMMWORD [rsp]
-        add     rsp, SIZEOF_XMMWORD
-        pop rdi
-        pop rsi
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-%endmacro
-
-%else
-
-%imacro collect_args 0
-        push r10
-        push r11
-        push r12
-        push r13
-        push r14
-        push r15
-        mov r10, rdi
-        mov r11, rsi
-        mov r12, rdx
-        mov r13, rcx
-        mov r14, r8
-        mov r15, r9
-%endmacro
-
-%imacro uncollect_args 0
-        pop r15
-        pop r14
-        pop r13
-        pop r12
-        pop r11
-        pop r10
-%endmacro
-
-%endif
-
-%endif
-
-; --------------------------------------------------------------------------
-;  Defines picked up from the C headers
-;
-%include "jsimdcfg.inc"
-
-; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/mips/jsimd.c b/media/libjpeg/simd/mips/jsimd.c
new file mode 100644
index 0000000000..d2546eed32
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd.c
@@ -0,0 +1,1147 @@
+/*
+ * jsimd_mips.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2020, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if !(defined(__mips_dsp) && (__mips_dsp_rev >= 2)) && defined(__linux__)
+
+LOCAL(void)
+parse_proc_cpuinfo(const char *search_string)
+{
+  const char *file_name = "/proc/cpuinfo";
+  char cpuinfo_line[256];
+  FILE *f = NULL;
+
+  simd_support = 0;
+
+  if ((f = fopen(file_name, "r")) != NULL) {
+    while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+      if (strstr(cpuinfo_line, search_string) != NULL) {
+        fclose(f);
+        simd_support |= JSIMD_DSPR2;
+        return;
+      }
+    }
+    fclose(f);
+  }
+  /* Did not find string in the proc file, or not Linux ELF. */
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+  simd_support |= JSIMD_DSPR2;
+#elif defined(__linux__)
+  /* We still have a chance to use MIPS DSPR2 regardless of globally used
+   * -mdspr2 options passed to gcc by performing runtime detection via
+   * /proc/cpuinfo parsing on linux */
+  parse_proc_cpuinfo("MIPS 74K");
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEDSPR2");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_DSPR2;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+static const int mips_idct_ifast_coefs[4] = {
+  0x45404540,           /* FIX( 1.082392200 / 2) =  17734 = 0x4546 */
+  0x5A805A80,           /* FIX( 1.414213562 / 2) =  23170 = 0x5A82 */
+  0x76407640,           /* FIX( 1.847759065 / 2) =  30274 = 0x7642 */
+  0xAC60AC60            /* FIX(-2.613125930 / 4) = -21407 = 0xAC61 */
+};
+
+/* The following struct is borrowed from jdsample.c */
+typedef void (*upsample1_ptr) (j_decompress_ptr cinfo,
+                               jpeg_component_info *compptr,
+                               JSAMPARRAY input_data,
+                               JSAMPARRAY *output_data_ptr);
+typedef struct {
+  struct jpeg_upsampler pub;
+  JSAMPARRAY color_buf[MAX_COMPONENTS];
+  upsample1_ptr methods[MAX_COMPONENTS];
+  int next_row_out;
+  JDIMENSION rows_to_go;
+  int rowgroup_height[MAX_COMPONENTS];
+  UINT8 h_expand[MAX_COMPONENTS];
+  UINT8 v_expand[MAX_COMPONENTS];
+} my_upsampler;
+
+typedef my_upsampler *my_upsample_ptr;
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_extrgbx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_extbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_extbgrx_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_extxbgr_ycc_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_extxrgb_ycc_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_extrgb_ycc_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_extrgbx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_extbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_extbgrx_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_extxbgr_gray_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_extxrgb_gray_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_extrgb_gray_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_ycc_extrgbx_convert_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_ycc_extbgr_convert_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_ycc_extbgrx_convert_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_ycc_extxbgr_convert_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_ycc_extxrgb_convert_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_ycc_extrgb_convert_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+  jsimd_c_null_convert_dspr2(cinfo->image_width, input_buf, output_buf,
+                             output_row, num_rows, cinfo->num_components);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v2_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  /* FIXME: jsimd_h2v1_downsample_dspr2() fails some of the TJBench tiling
+   * regression tests, probably because the DSPr2 SIMD implementation predates
+   * those tests. */
+#if 0
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_smooth_downsample_dspr2(input_data, output_data,
+                                     compptr->v_samp_factor,
+                                     cinfo->max_v_samp_factor,
+                                     cinfo->smoothing_factor,
+                                     compptr->width_in_blocks,
+                                     cinfo->image_width);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_dspr2(cinfo->image_width, cinfo->max_v_samp_factor,
+                              compptr->v_samp_factor, compptr->width_in_blocks,
+                              input_data, output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_dspr2(cinfo->max_v_samp_factor, cinfo->output_width,
+                            input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  my_upsample_ptr upsample = (my_upsample_ptr)cinfo->upsample;
+
+  jsimd_int_upsample_dspr2(upsample->h_expand[compptr->component_index],
+                           upsample->v_expand[compptr->component_index],
+                           input_data, output_data_ptr, cinfo->output_width,
+                           cinfo->max_v_samp_factor);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_dspr2(cinfo->max_v_samp_factor,
+                                  compptr->downsampled_width, input_data,
+                                  output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_h2v2_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_h2v2_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_h2v2_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_h2v2_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_h2v2_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_h2v2_extrgb_merged_upsample_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*dspr2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, JSAMPLE *);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    dspr2fct = jsimd_h2v1_extrgbx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGR:
+    dspr2fct = jsimd_h2v1_extbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    dspr2fct = jsimd_h2v1_extbgrx_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    dspr2fct = jsimd_h2v1_extxbgr_merged_upsample_dspr2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    dspr2fct = jsimd_h2v1_extxrgb_merged_upsample_dspr2;
+    break;
+  default:
+    dspr2fct = jsimd_h2v1_extrgb_merged_upsample_dspr2;
+    break;
+  }
+
+  dspr2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf,
+           cinfo->sample_range_limit);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifndef __mips_soft_float
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_dspr2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+  jsimd_convsamp_float_dspr2(sample_data, start_col, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_dspr2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#ifndef __mips_soft_float
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_dspr2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+#ifndef __mips_soft_float
+  jsimd_quantize_float_dspr2(coef_block, divisors, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  init_simd();
+
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  int workspace[DCTSIZE * 4]; /* buffers data between passes */
+
+  jsimd_idct_4x4_dspr2(compptr->dct_table, coef_block, output_buf, output_col,
+                       workspace);
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_6x6_dspr2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int workspace[96];
+  int output[12] = {
+    (int)(output_buf[0] + output_col),
+    (int)(output_buf[1] + output_col),
+    (int)(output_buf[2] + output_col),
+    (int)(output_buf[3] + output_col),
+    (int)(output_buf[4] + output_col),
+    (int)(output_buf[5] + output_col),
+    (int)(output_buf[6] + output_col),
+    (int)(output_buf[7] + output_col),
+    (int)(output_buf[8] + output_col),
+    (int)(output_buf[9] + output_col),
+    (int)(output_buf[10] + output_col),
+    (int)(output_buf[11] + output_col)
+  };
+
+  jsimd_idct_12x12_pass1_dspr2(coef_block, compptr->dct_table, workspace);
+  jsimd_idct_12x12_pass2_dspr2(workspace, output);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+#if defined(__MIPSEL__)
+  if (simd_support & JSIMD_DSPR2)
+    return 1;
+#endif
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  int output[8] = {
+    (int)(output_buf[0] + output_col),
+    (int)(output_buf[1] + output_col),
+    (int)(output_buf[2] + output_col),
+    (int)(output_buf[3] + output_col),
+    (int)(output_buf[4] + output_col),
+    (int)(output_buf[5] + output_col),
+    (int)(output_buf[6] + output_col),
+    (int)(output_buf[7] + output_col)
+  };
+
+  jsimd_idct_islow_dspr2(coef_block, compptr->dct_table, output,
+                         IDCT_range_limit(cinfo));
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  JCOEFPTR inptr;
+  IFAST_MULT_TYPE *quantptr;
+  DCTELEM workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  inptr = coef_block;
+  quantptr = (IFAST_MULT_TYPE *)compptr->dct_table;
+
+  jsimd_idct_ifast_cols_dspr2(inptr, quantptr, workspace,
+                              mips_idct_ifast_coefs);
+
+  /* Pass 2: process rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling. */
+
+  jsimd_idct_ifast_rows_dspr2(workspace, output_buf, output_col,
+                              mips_idct_ifast_coefs);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2.S b/media/libjpeg/simd/mips/jsimd_dspr2.S
new file mode 100644
index 0000000000..c99288a8d1
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2.S
@@ -0,0 +1,4543 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ *                          All Rights Reserved.
+ * Authors:  Teodora Novkovic <teodora.novkovic@imgtec.com>
+ *           Darko Laus       <darko.laus@imgtec.com>
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#include "jsimd_dspr2_asm.h"
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_c_null_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ * 20(sp) = cinfo->num_components
+ *
+ * Null conversion for compression
+ */
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    lw          t9, 24(sp)      /* t9 = num_rows */
+    lw          s0, 28(sp)      /* s0 = cinfo->num_components */
+    andi        t0, a0, 3       /* t0 = cinfo->image_width & 3 */
+    beqz        t0, 4f          /* no residual */
+     nop
+0:
+    addiu       t9, t9, -1
+    bltz        t9, 7f
+     li         t1, 0
+1:
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1
+    addu        s1, t5, a0
+    addu        t6, t5, t0
+2:
+    lbu         t3, 0(t2)
+    addiu       t5, t5, 1
+    sb          t3, -1(t5)
+    bne         t6, t5, 2b
+     addu       t2, t2, s0
+3:
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 3b
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1
+    bne         t1, s0, 1b
+     nop
+    addiu       a1, a1, 4
+    bgez        t9, 0b
+     addiu      a3, a3, 1
+    b           7f
+     nop
+4:
+    addiu       t9, t9, -1
+    bltz        t9, 7f
+     li         t1, 0
+5:
+    sll         t3, t1, 2
+    lwx         t5, t3(a2)      /* t5 = outptr = output_buf[ci] */
+    lw          t2, 0(a1)       /* t2 = inptr = *input_buf */
+    sll         t4, a3, 2
+    lwx         t5, t4(t5)      /* t5 = outptr = output_buf[ci][output_row] */
+    addu        t2, t2, t1
+    addu        s1, t5, a0
+    addu        t6, t5, t0
+6:
+    lbu         t3, 0(t2)
+    addu        t4, t2, s0
+    addu        t7, t4, s0
+    addu        t8, t7, s0
+    addu        t2, t8, s0
+    lbu         t4, 0(t4)
+    lbu         t7, 0(t7)
+    lbu         t8, 0(t8)
+    addiu       t5, t5, 4
+    sb          t3, -4(t5)
+    sb          t4, -3(t5)
+    sb          t7, -2(t5)
+    bne         s1, t5, 6b
+     sb         t8, -1(t5)
+    addiu       t1, t1, 1
+    bne         t1, s0, 5b
+     nop
+    addiu       a1, a1, 4
+    bgez        t9, 4b
+     addiu      a3, a3, 1
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+
+END(jsimd_c_null_convert_dspr2)
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_ycc_convert_dspr2
+ * jsimd_extbgr_ycc_convert_dspr2
+ * jsimd_extrgbx_ycc_convert_dspr2
+ * jsimd_extbgrx_ycc_convert_dspr2
+ * jsimd_extxbgr_ycc_convert_dspr2
+ * jsimd_extxrgb_ycc_convert_dspr2
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+.macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_YCC  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)
+    lbu         \g, \g_offs(\inptr)
+    lbu         \b, \b_offs(\inptr)
+    addiu       \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          t7, 48(sp)      /* t7 = num_rows */
+    li          s0, 0x4c8b      /* FIX(0.29900) */
+    li          s1, 0x9646      /* FIX(0.58700) */
+    li          s2, 0x1d2f      /* FIX(0.11400) */
+    li          s3, 0xffffd4cd  /* -FIX(0.16874) */
+    li          s4, 0xffffab33  /* -FIX(0.33126) */
+    li          s5, 0x8000      /* FIX(0.50000) */
+    li          s6, 0xffff94d1  /* -FIX(0.41869) */
+    li          s7, 0xffffeb2f  /* -FIX(0.08131) */
+    li          t8, 0x807fff    /* CBCR_OFFSET + ONE_HALF-1 */
+
+0:
+    addiu       t7, -1          /* --num_rows */
+    lw          t6, 0(a1)       /* t6 = input_buf[0] */
+    lw          t0, 0(a2)
+    lw          t1, 4(a2)
+    lw          t2, 8(a2)
+    sll         t3, a3, 2
+    lwx         t0, t3(t0)      /* t0 = output_buf[0][output_row] */
+    lwx         t1, t3(t1)      /* t1 = output_buf[1][output_row] */
+    lwx         t2, t3(t2)      /* t2 = output_buf[2][output_row] */
+
+    addu        t9, t2, a0      /* t9 = end address */
+    addiu       a3, 1
+
+1:
+    DO_RGB_TO_YCC t3, t4, t5, t6
+
+    mtlo        s5, $ac0
+    mtlo        t8, $ac1
+    mtlo        t8, $ac2
+    maddu       $ac0, s2, t5
+    maddu       $ac1, s5, t5
+    maddu       $ac2, s5, t3
+    maddu       $ac0, s0, t3
+    maddu       $ac1, s3, t3
+    maddu       $ac2, s6, t4
+    maddu       $ac0, s1, t4
+    maddu       $ac1, s4, t4
+    maddu       $ac2, s7, t5
+    extr.w      t3, $ac0, 16
+    extr.w      t4, $ac1, 16
+    extr.w      t5, $ac2, 16
+    sb          t3, 0(t0)
+    sb          t4, 0(t1)
+    sb          t5, 0(t2)
+    addiu       t0, 1
+    addiu       t2, 1
+    bne         t2, t9, 1b
+     addiu      t1, 1
+    bgtz        t7, 0b
+     addiu      a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_ycc_convert_dspr2)
+
+.purgem DO_RGB_TO_YCC
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B */
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_ycc_extrgb_convert_dspr2
+ * jsimd_ycc_extbgr_convert_dspr2
+ * jsimd_ycc_extrgbx_convert_dspr2
+ * jsimd_ycc_extbgrx_convert_dspr2
+ * jsimd_ycc_extxbgr_convert_dspr2
+ * jsimd_ycc_extxrgb_convert_dspr2
+ *
+ * Colorspace conversion YCbCr -> RGB
+ */
+
+.macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2  colorid, pixel_size, \
+                                             r_offs, g_offs, b_offs, a_offs
+
+.macro STORE_YCC_TO_RGB  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r_offs(\outptr)
+    sb          \scratch1, \g_offs(\outptr)
+    sb          \scratch2, \b_offs(\outptr)
+.if (\pixel_size == 4)
+    li          t0, 0xFF
+    sb          t0, \a_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = input_row
+ * a3     = output_buf
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s1, 48(sp)
+    li          t3, 0x8000
+    li          t4, 0x166e9     /* FIX(1.40200) */
+    li          t5, 0x1c5a2     /* FIX(1.77200) */
+    li          t6, 0xffff492e  /* -FIX(0.71414) */
+    li          t7, 0xffffa7e6  /* -FIX(0.34414) */
+    repl.ph     t8, 128
+
+0:
+    lw          s0, 0(a3)
+    lw          t0, 0(a1)
+    lw          t1, 4(a1)
+    lw          t2, 8(a1)
+    sll         s5, a2, 2
+    addiu       s1, -1
+    lwx         s2, s5(t0)
+    lwx         s3, s5(t1)
+    lwx         s4, s5(t2)
+    addu        t9, s2, a0
+    addiu       a2, 1
+
+1:
+    lbu         s7, 0(s4)       /* cr */
+    lbu         s6, 0(s3)       /* cb */
+    lbu         s5, 0(s2)       /* y */
+    addiu       s2, 1
+    addiu       s4, 1
+    addiu       s7, -128
+    addiu       s6, -128
+    mul         t2, t7, s6
+    mul         t0, t6, s7      /* Crgtab[cr] */
+    sll         s7, 15
+    mulq_rs.w   t1, t4, s7      /* Crrtab[cr] */
+    sll         s6, 15
+    addu        t2, t3          /* Cbgtab[cb] */
+    addu        t2, t0
+
+    mulq_rs.w   t0, t5, s6      /* Cbbtab[cb] */
+    sra         t2, 16
+    addu        t1, s5
+    addu        t2, s5          /* add y */
+    ins         t2, t1, 16, 16
+    subu.ph     t2, t2, t8
+    addu        t0, s5
+    shll_s.ph   t2, t2, 8
+    subu        t0, 128
+    shra.ph     t2, t2, 8
+    shll_s.w    t0, t0, 24
+    addu.ph     t2, t2, t8      /* clip & store */
+    sra         t0, t0, 24
+    sra         t1, t2, 16
+    addiu       t0, 128
+
+    STORE_YCC_TO_RGB t1, t2, t0, s0
+
+    bne         s2, t9, 1b
+     addiu      s3, 1
+    bgtz        s1, 0b
+     addiu      a3, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_ycc_\colorid\()_convert_dspr2)
+
+.purgem STORE_YCC_TO_RGB
+
+.endm
+
+/*-------------------------------------id -- pix R  G  B  A */
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb,  3, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr,  3, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
+GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
+
+
+/*****************************************************************************/
+/*
+ * jsimd_extrgb_gray_convert_dspr2
+ * jsimd_extbgr_gray_convert_dspr2
+ * jsimd_extrgbx_gray_convert_dspr2
+ * jsimd_extbgrx_gray_convert_dspr2
+ * jsimd_extxbgr_gray_convert_dspr2
+ * jsimd_extxrgb_gray_convert_dspr2
+ *
+ * Colorspace conversion RGB -> GRAY
+ */
+
+.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2  colorid, pixel_size, \
+                                              r_offs, g_offs, b_offs
+
+.macro DO_RGB_TO_GRAY  r, g, b, inptr
+    lbu         \r, \r_offs(\inptr)
+    lbu         \g, \g_offs(\inptr)
+    lbu         \b, \b_offs(\inptr)
+    addiu       \inptr, \pixel_size
+.endm
+
+LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = input_buf
+ * a2     = output_buf
+ * a3     = output_row
+ * 16(sp) = num_rows
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    li          s0, 0x4c8b      /* s0 = FIX(0.29900) */
+    li          s1, 0x9646      /* s1 = FIX(0.58700) */
+    li          s2, 0x1d2f      /* s2 = FIX(0.11400) */
+    li          s7, 0x8000      /* s7 = FIX(0.50000) */
+    lw          s6, 48(sp)
+    andi        t7, a0, 3
+
+0:
+    addiu       s6, -1          /* s6 = num_rows */
+    lw          t0, 0(a1)
+    lw          t1, 0(a2)
+    sll         t3, a3, 2
+    lwx         t1, t3(t1)
+    addiu       a3, 1
+    addu        t9, t1, a0
+    subu        t8, t9, t7
+    beq         t1, t8, 2f
+     nop
+
+1:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    maddu       $ac0, s0, t3
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t6, $ac0, 16
+
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+    DO_RGB_TO_GRAY s3, s4, s5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    extr.w      t2, $ac1, 16
+    maddu       $ac0, s0, t3
+    mtlo        s7, $ac1
+    maddu       $ac1, s2, s5
+    maddu       $ac1, s1, s4
+    maddu       $ac1, s0, s3
+    extr.w      t5, $ac0, 16
+    sb          t6, 0(t1)
+    sb          t2, 1(t1)
+    extr.w      t3, $ac1, 16
+    addiu       t1, 4
+    sb          t5, -2(t1)
+    sb          t3, -1(t1)
+    bne         t1, t8, 1b
+     nop
+
+2:
+    beqz        t7, 4f
+     nop
+
+3:
+    DO_RGB_TO_GRAY t3, t4, t5, t0
+
+    mtlo        s7, $ac0
+    maddu       $ac0, s2, t5
+    maddu       $ac0, s1, t4
+    maddu       $ac0, s0, t3
+    extr.w      t6, $ac0, 16
+    sb          t6, 0(t1)
+    addiu       t1, 1
+    bne         t1, t9, 3b
+     nop
+
+4:
+    bgtz        s6, 0b
+     addiu      a1, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_\colorid\()_gray_convert_dspr2)
+
+.purgem DO_RGB_TO_GRAY
+
+.endm
+
+/*-------------------------------------id --  pix R  G  B */
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb,  3, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr,  3, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
+GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_merged_upsample_dspr2
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v2 upsample routines
+ */
+.macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V2_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          \scratch0, 0xFF
+    sb          \scratch0, \a1_offs(\outptr)
+    sb          \scratch0, \a2_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V2_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0     = cinfo->output_width
+ * a1     = input_buf
+ * a2     = in_row_group_ctr
+ * a3     = output_buf
+ * 16(sp) = cinfo->sample_range_limit
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    lw          t9, 56(sp)      /* cinfo->sample_range_limit */
+    lw          v0, 0(a1)
+    lw          v1, 4(a1)
+    lw          t0, 8(a1)
+    sll         t1, a2, 3
+    addiu       t2, t1, 4
+    sll         t3, a2, 2
+    lw          t4, 0(a3)       /* t4 = output_buf[0] */
+    lwx         t1, t1(v0)      /* t1 = input_buf[0][in_row_group_ctr*2] */
+    lwx         t2, t2(v0)      /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
+    lwx         t5, t3(v1)      /* t5 = input_buf[1][in_row_group_ctr] */
+    lwx         t6, t3(t0)      /* t6 = input_buf[2][in_row_group_ctr] */
+    lw          t7, 4(a3)       /* t7 = output_buf[1] */
+    li          s1, 0xe6ea
+    addiu       t8, s1, 0x7fff    /* t8 = 0x166e9 [FIX(1.40200)] */
+    addiu       s0, t8, 0x5eb9    /* s0 = 0x1c5a2 [FIX(1.77200)] */
+    addiu       s1, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+    xori        s2, s1, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
+    srl         t3, a0, 1
+    blez        t3, 2f
+     addu       t0, t5, t3      /* t0 = end address */
+ 1:
+    lbu         t3, 0(t5)
+    lbu         s3, 0(t6)
+    addiu       t5, t5, 1
+    addiu       t3, t3, -128    /* (cb - 128) */
+    addiu       s3, s3, -128    /* (cr - 128) */
+    mult        $ac1, s1, t3
+    madd        $ac1, s2, s3
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+    extr_r.w    s5, $ac1, 16
+    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+    lbu         v0, 0(t1)
+    addiu       t6, t6, 1
+    addiu       t1, t1, 2
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         AT, 0(t3)
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)
+    lbu         v0, -1(t1)
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
+
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         AT, 0(t3)
+    lbu         s7, 0(s3)
+    lbu         ra, 0(v1)
+    lbu         v0, 1(t2)
+    addiu       t2, t2, 2
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
+
+    bne         t0, t5, 1b
+     nop
+2:
+    andi        t0, a0, 1
+    beqz        t0, 4f
+     lbu        t3, 0(t5)
+    lbu         s3, 0(t6)
+    addiu       t3, t3, -128    /* (cb - 128) */
+    addiu       s3, s3, -128    /* (cr - 128) */
+    mult        $ac1, s1, t3
+    madd        $ac1, s2, s3
+    sll         s3, s3, 15
+    sll         t3, t3, 15
+    lbu         v0, 0(t1)
+    extr_r.w    s5, $ac1, 16
+    mulq_rs.w   s4, t8, s3      /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
+    mulq_rs.w   s6, s0, t3      /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+    lbu         v0, 0(t2)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t4
+
+    addu        t3, v0, s4      /* y+cred */
+    addu        s3, v0, s5      /* y+cgreen */
+    addu        v1, v0, s6      /* y+cblue */
+    addu        t3, t9, t3      /* y+cred */
+    addu        s3, t9, s3      /* y+cgreen */
+    addu        v1, t9, v1      /* y+cblue */
+    lbu         t3, 0(t3)
+    lbu         s3, 0(s3)
+    lbu         v1, 0(v1)
+
+    STORE_H2V2_1_PIXEL t3, s3, v1, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V2_1_PIXEL
+.purgem STORE_H2V2_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v1_merged_upsample_dspr2
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
+ *
+ * Merged h2v1 upsample routines
+ */
+
+.macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2  colorid, pixel_size, \
+                                            r1_offs, g1_offs, \
+                                            b1_offs, a1_offs, \
+                                            r2_offs, g2_offs, \
+                                            b2_offs, a2_offs
+
+.macro STORE_H2V1_2_PIXELS  scratch0 scratch1 scratch2 scratch3 scratch4 \
+                            scratch5 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+    sb          \scratch3, \r2_offs(\outptr)
+    sb          \scratch4, \g2_offs(\outptr)
+    sb          \scratch5, \b2_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+    sb          t0, \a2_offs(\outptr)
+.endif
+    addiu       \outptr, \pixel_size
+.endm
+
+.macro STORE_H2V1_1_PIXEL  scratch0 scratch1 scratch2 outptr
+    sb          \scratch0, \r1_offs(\outptr)
+    sb          \scratch1, \g1_offs(\outptr)
+    sb          \scratch2, \b1_offs(\outptr)
+.if (\pixel_size == 8)
+    li          t0, 0xFF
+    sb          t0, \a1_offs(\outptr)
+.endif
+.endm
+
+LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+/*
+ * a0     = cinfo->output_width
+ * a1     = input_buf
+ * a2     = in_row_group_ctr
+ * a3     = output_buf
+ * 16(sp) = range_limit
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    li          t0, 0xe6ea
+    lw          t1, 0(a1)         /* t1 = input_buf[0] */
+    lw          t2, 4(a1)         /* t2 = input_buf[1] */
+    lw          t3, 8(a1)         /* t3 = input_buf[2] */
+    lw          t8, 56(sp)        /* t8 = range_limit */
+    addiu       s1, t0, 0x7fff    /* s1 = 0x166e9 [FIX(1.40200)] */
+    addiu       s2, s1, 0x5eb9    /* s2 = 0x1c5a2 [FIX(1.77200)] */
+    addiu       s0, t0, 0x9916    /* s0 = 0x8000 */
+    addiu       s4, zero, 0xa7e6  /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
+    xori        s3, s4, 0xeec8    /* s3 = 0xffff492e [-FIX(0.71414)] */
+    srl         t0, a0, 1
+    sll         t4, a2, 2
+    lwx         s5, t4(t1)      /* s5 = inptr0 */
+    lwx         s6, t4(t2)      /* s6 = inptr1 */
+    lwx         s7, t4(t3)      /* s7 = inptr2 */
+    lw          t7, 0(a3)       /* t7 = outptr */
+    blez        t0, 2f
+     addu       t9, s6, t0      /* t9 = end address */
+1:
+    lbu         t2, 0(s6)       /* t2 = cb */
+    lbu         t0, 0(s7)       /* t0 = cr */
+    lbu         t1, 0(s5)       /* t1 = y */
+    addiu       t2, t2, -128    /* t2 = cb - 128 */
+    addiu       t0, t0, -128    /* t0 = cr - 128 */
+    mult        $ac1, s4, t2
+    madd        $ac1, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
+    extr_r.w    t5, $ac1, 16
+    mulq_rs.w   t6, s2, t2      /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
+    addiu       s7, s7, 1
+    addiu       s6, s6, 1
+    addu        t2, t1, t0      /* t2 = y + cred */
+    addu        t3, t1, t5      /* t3 = y + cgreen */
+    addu        t4, t1, t6      /* t4 = y + cblue */
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t1, 1(s5)
+    lbu         v0, 0(t2)
+    lbu         v1, 0(t3)
+    lbu         ra, 0(t4)
+    addu        t2, t1, t0
+    addu        t3, t1, t5
+    addu        t4, t1, t6
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
+
+    bne         t9, s6, 1b
+     addiu      s5, s5, 2
+2:
+    andi        t0, a0, 1
+    beqz        t0, 4f
+     nop
+3:
+    lbu         t2, 0(s6)
+    lbu         t0, 0(s7)
+    lbu         t1, 0(s5)
+    addiu       t2, t2, -128    /* (cb - 128) */
+    addiu       t0, t0, -128    /* (cr - 128) */
+    mul         t3, s4, t2
+    mul         t4, s3, t0
+    sll         t0, t0, 15
+    sll         t2, t2, 15
+    mulq_rs.w   t0, s1, t0      /* (C1*cr + ONE_HALF)>> SCALEBITS */
+    mulq_rs.w   t6, s2, t2      /* (C2*cb + ONE_HALF)>> SCALEBITS */
+    addu        t3, t3, s0
+    addu        t3, t4, t3
+    sra         t5, t3, 16      /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
+    addu        t2, t1, t0      /* y + cred */
+    addu        t3, t1, t5      /* y + cgreen */
+    addu        t4, t1, t6      /* y + cblue */
+    addu        t2, t8, t2
+    addu        t3, t8, t3
+    addu        t4, t8, t4
+    lbu         t2, 0(t2)
+    lbu         t3, 0(t3)
+    lbu         t4, 0(t4)
+
+    STORE_H2V1_1_PIXEL t2, t3, t4, t7
+4:
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
+
+    j           ra
+     nop
+
+END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
+
+.purgem STORE_H2V1_1_PIXEL
+.purgem STORE_H2V1_2_PIXELS
+.endm
+
+/*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
+GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
+
+
+/*****************************************************************************/
+/*
+ * jsimd_h2v2_fancy_upsample_dspr2
+ *
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+ */
+LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    li            s4, 0
+    lw            s2, 0(a3)       /* s2 = *output_data_ptr */
+0:
+    li            t9, 2
+    lw            s1, -4(a2)      /* s1 = inptr1 */
+
+1:
+    lw            s0, 0(a2)       /* s0 = inptr0 */
+    lwx           s3, s4(s2)
+    addiu         s5, a1, -2      /* s5 = downsampled_width - 2 */
+    srl           t4, s5, 1
+    sll           t4, t4, 1
+    lbu           t0, 0(s0)
+    lbu           t1, 1(s0)
+    lbu           t2, 0(s1)
+    lbu           t3, 1(s1)
+    addiu         s0, 2
+    addiu         s1, 2
+    addu          t8, s0, t4      /* t8 = end address */
+    andi          s5, s5, 1       /* s5 = residual */
+    sll           t4, t0, 1
+    sll           t6, t1, 1
+    addu          t0, t0, t4      /* t0 = (*inptr0++) * 3 */
+    addu          t1, t1, t6      /* t1 = (*inptr0++) * 3 */
+    addu          t7, t0, t2      /* t7 = thiscolsum */
+    addu          t6, t1, t3      /* t5 = nextcolsum */
+    sll           t0, t7, 2       /* t0 = thiscolsum * 4 */
+    subu          t1, t0, t7      /* t1 = thiscolsum * 3 */
+    shra_r.w      t0, t0, 4
+    addiu         t1, 7
+    addu          t1, t1, t6
+    srl           t1, t1, 4
+    sb            t0, 0(s3)
+    sb            t1, 1(s3)
+    beq           t8, s0, 22f     /* skip to final iteration if width == 3 */
+     addiu        s3, 2
+2:
+    lh            t0, 0(s0)       /* t0 = A3|A2 */
+    lh            t2, 0(s1)       /* t2 = B3|B2 */
+    addiu         s0, 2
+    addiu         s1, 2
+    preceu.ph.qbr t0, t0          /* t0 = 0|A3|0|A2 */
+    preceu.ph.qbr t2, t2          /* t2 = 0|B3|0|B2 */
+    shll.ph       t1, t0, 1
+    sll           t3, t6, 1
+    addu.ph       t0, t1, t0      /* t0 = A3*3|A2*3 */
+    addu          t3, t3, t6      /* t3 = this * 3 */
+    addu.ph       t0, t0, t2      /* t0 = next2|next1 */
+    addu          t1, t3, t7
+    andi          t7, t0, 0xFFFF  /* t7 = next1 */
+    sll           t2, t7, 1
+    addu          t2, t7, t2      /* t2 = next1*3 */
+    addu          t4, t2, t6
+    srl           t6, t0, 16      /* t6 = next2 */
+    shra_r.w      t1, t1, 4       /* t1 = (this*3 + last + 8) >> 4 */
+    addu          t0, t3, t7
+    addiu         t0, 7
+    srl           t0, t0, 4       /* t0 = (this*3 + next1 + 7) >> 4 */
+    shra_r.w      t4, t4, 4       /* t3 = (next1*3 + this + 8) >> 4 */
+    addu          t2, t2, t6
+    addiu         t2, 7
+    srl           t2, t2, 4       /* t2 = (next1*3 + next2 + 7) >> 4 */
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    sb            t4, 2(s3)
+    sb            t2, 3(s3)
+    bne           t8, s0, 2b
+     addiu        s3, 4
+22:
+    beqz          s5, 4f
+     addu         t8, s0, s5
+3:
+    lbu           t0, 0(s0)
+    lbu           t2, 0(s1)
+    addiu         s0, 1
+    addiu         s1, 1
+    sll           t3, t6, 1
+    sll           t1, t0, 1
+    addu          t1, t0, t1      /* t1 = inptr0 * 3 */
+    addu          t3, t3, t6      /* t3 = thiscolsum * 3 */
+    addu          t5, t1, t2
+    addu          t1, t3, t7
+    shra_r.w      t1, t1, 4
+    addu          t0, t3, t5
+    addiu         t0, 7
+    srl           t0, t0, 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         s3, 2
+    move          t7, t6
+    bne           t8, s0, 3b
+     move         t6, t5
+4:
+    sll           t0, t6, 2       /* t0 = thiscolsum * 4 */
+    subu          t1, t0, t6      /* t1 = thiscolsum * 3 */
+    addu          t1, t1, t7
+    addiu         s4, 4
+    shra_r.w      t1, t1, 4
+    addiu         t0, 7
+    srl           t0, t0, 4
+    sb            t1, 0(s3)
+    sb            t0, 1(s3)
+    addiu         t9, -1
+    addiu         s3, 2
+    bnez          t9, 1b
+     lw           s1, 4(a2)
+    srl           t0, s4, 2
+    subu          t0, a0, t0
+    bgtz          t0, 0b
+     addiu        a2, 4
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j             ra
+     nop
+END(jsimd_h2v2_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = downsampled_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    .set at
+
+    beqz          a0, 3f
+     sll          t0, a0, 2
+    lw            s1, 0(a3)
+    li            s3, 0x10001
+    addu          s0, s1, t0
+0:
+    addiu         t8, a1, -2
+    srl           t9, t8, 2
+    lw            t7, 0(a2)
+    lw            s2, 0(s1)
+    lbu           t0, 0(t7)
+    lbu           t1, 1(t7)       /* t1 = inptr[1] */
+    sll           t2, t0, 1
+    addu          t2, t2, t0      /* t2 = invalue*3 */
+    addu          t2, t2, t1
+    shra_r.w      t2, t2, 2
+    sb            t0, 0(s2)
+    sb            t2, 1(s2)
+    beqz          t9, 11f
+     addiu        s2, 2
+1:
+    ulw           t0, 0(t7)       /* t0 = |P3|P2|P1|P0| */
+    ulw           t1, 1(t7)
+    ulh           t2, 4(t7)       /* t2 = |0|0|P5|P4| */
+    preceu.ph.qbl t3, t0          /* t3 = |0|P3|0|P2| */
+    preceu.ph.qbr t0, t0          /* t0 = |0|P1|0|P0| */
+    preceu.ph.qbr t2, t2          /* t2 = |0|P5|0|P4| */
+    preceu.ph.qbl t4, t1          /* t4 = |0|P4|0|P3| */
+    preceu.ph.qbr t1, t1          /* t1 = |0|P2|0|P1| */
+    shll.ph       t5, t4, 1
+    shll.ph       t6, t1, 1
+    addu.ph       t5, t5, t4      /* t5 = |P4*3|P3*3| */
+    addu.ph       t6, t6, t1      /* t6 = |P2*3|P1*3| */
+    addu.ph       t4, t3, s3
+    addu.ph       t0, t0, s3
+    addu.ph       t4, t4, t5
+    addu.ph       t0, t0, t6
+    shrl.ph       t4, t4, 2       /* t4 = |0|P3|0|P2| */
+    shrl.ph       t0, t0, 2       /* t0 = |0|P1|0|P0| */
+    addu.ph       t2, t2, t5
+    addu.ph       t3, t3, t6
+    shra_r.ph     t2, t2, 2       /* t2 = |0|P5|0|P4| */
+    shra_r.ph     t3, t3, 2       /* t3 = |0|P3|0|P2| */
+    shll.ph       t2, t2, 8
+    shll.ph       t3, t3, 8
+    or            t2, t4, t2
+    or            t3, t3, t0
+    addiu         t9, -1
+    usw           t3, 0(s2)
+    usw           t2, 4(s2)
+    addiu         s2, 8
+    bgtz          t9, 1b
+     addiu        t7, 4
+11:
+    andi          t8, 3
+    beqz          t8, 22f
+     addiu        t7, 1
+
+2:
+    lbu           t0, 0(t7)
+    addiu         t7, 1
+    sll           t1, t0, 1
+    addu          t2, t0, t1      /* t2 = invalue */
+    lbu           t3, -2(t7)
+    lbu           t4, 0(t7)
+    addiu         t3, 1
+    addiu         t4, 2
+    addu          t3, t3, t2
+    addu          t4, t4, t2
+    srl           t3, 2
+    srl           t4, 2
+    sb            t3, 0(s2)
+    sb            t4, 1(s2)
+    addiu         t8, -1
+    bgtz          t8, 2b
+     addiu        s2, 2
+
+22:
+    lbu           t0, 0(t7)
+    lbu           t2, -1(t7)
+    sll           t1, t0, 1
+    addu          t1, t1, t0      /* t1 = invalue * 3 */
+    addu          t1, t1, t2
+    addiu         t1, 1
+    srl           t1, t1, 2
+    sb            t1, 0(s2)
+    sb            t0, 1(s2)
+    addiu         s1, 4
+    bne           s1, s0, 0b
+     addiu        a2, 4
+3:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j             ra
+     nop
+END(jsimd_h2v1_fancy_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
+
+    beqz        a2, 7f
+     lw         s1, 44(sp)      /* s1 = output_data */
+    lw          s0, 40(sp)      /* s0 = input_data */
+    srl         s2, a0, 2
+    andi        t9, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       /* t0 = width_in_blocks*DCT */
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    andi        t6, a0, 1       /* t6 = temp_index */
+    addiu       t6, -1
+    lw          t4, 0(s1)       /* t4 = outptr */
+    lw          t5, 0(s0)       /* t5 = inptr0 */
+    li          s3, 0           /* s3 = bias */
+    srl         t7, a0, 1       /* t7 = image_width1 */
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+1:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 2(t5)
+    ulhu        t2, 4(t5)
+    ulhu        t3, 6(t5)
+    raddu.w.qb  t0, t0
+    raddu.w.qb  t1, t1
+    raddu.w.qb  t2, t2
+    raddu.w.qb  t3, t3
+    shra.ph     t0, t0, 1
+    shra_r.ph   t1, t1, 1
+    shra.ph     t2, t2, 1
+    shra_r.ph   t3, t3, 1
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t3, 3(t4)
+    addiu       s4, -1
+    addiu       t4, 4
+    bgtz        s4, 1b
+     addiu      t5, 8
+    beqz        t8, 3f
+     addu       s4, t4, t8
+2:
+    ulhu        t0, 0(t5)
+    raddu.w.qb  t0, t0
+    addqh.w     t0, t0, s3
+    xori        s3, s3, 1
+    sb          t0, 0(t4)
+    addiu       t4, 1
+    bne         t4, s4, 2b
+     addiu      t5, 2
+3:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    addqh.w     t2, t1, s3      /* t2 = pixval1 */
+    xori        s3, s3, 1
+    addqh.w     t3, t1, s3      /* t3 = pixval2 */
+    blez        s2, 5f
+     append     t3, t2,  8
+    addu        t5, t4, s2      /* t5 = loop_end2 */
+4:
+    ush         t3, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 4b
+     addiu      t4,  2
+5:
+    beqz        t9, 6f
+     nop
+    sb          t2, 0(t4)
+6:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 4
+7:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
+
+    j           ra
+    nop
+END(jsimd_h2v1_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
+/*
+ * a0     = cinfo->image_width
+ * a1     = cinfo->max_v_samp_factor
+ * a2     = compptr->v_samp_factor
+ * a3     = compptr->width_in_blocks
+ * 16(sp) = input_data
+ * 20(sp) = output_data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    beqz        a2, 8f
+     lw         s1, 52(sp)      /* s1 = output_data */
+    lw          s0, 48(sp)      /* s0 = input_data */
+
+    andi        t6, a0, 1       /* t6 = temp_index */
+    addiu       t6, -1
+    srl         t7, a0, 1       /* t7 = image_width1 */
+    srl         s4, t7, 2
+    andi        t8, t7, 3
+    andi        t9, a0, 2
+    srl         s2, a0, 2
+    srl         t7, t9, 1
+    addu        s2, t7, s2
+    sll         t0, a3, 3       /* s2 = width_in_blocks*DCT */
+    srl         t7, t0, 1
+    subu        s2, t7, s2
+0:
+    lw          t4, 0(s1)       /* t4 = outptr */
+    lw          t5, 0(s0)       /* t5 = inptr0 */
+    lw          s7, 4(s0)       /* s7 = inptr1 */
+    li          s6, 1           /* s6 = bias */
+2:
+    ulw         t0, 0(t5)       /* t0 = |P3|P2|P1|P0| */
+    ulw         t1, 0(s7)       /* t1 = |Q3|Q2|Q1|Q0| */
+    ulw         t2, 4(t5)
+    ulw         t3, 4(s7)
+    precrq.ph.w t7, t0, t1      /* t2 = |P3|P2|Q3|Q2| */
+    ins         t0, t1, 16, 16  /* t0 = |Q1|Q0|P1|P0| */
+    raddu.w.qb  t1, t7
+    raddu.w.qb  t0, t0
+    shra_r.w    t1, t1, 2
+    addiu       t0, 1
+    srl         t0, 2
+    precrq.ph.w t7, t2, t3
+    ins         t2, t3, 16, 16
+    raddu.w.qb  t7, t7
+    raddu.w.qb  t2, t2
+    shra_r.w    t7, t7, 2
+    addiu       t2, 1
+    srl         t2, 2
+    sb          t0, 0(t4)
+    sb          t1, 1(t4)
+    sb          t2, 2(t4)
+    sb          t7, 3(t4)
+    addiu       t4, 4
+    addiu       t5, 8
+    addiu       s4, s4, -1
+    bgtz        s4, 2b
+     addiu      s7, 8
+    beqz        t8, 4f
+     addu       t8, t4, t8
+3:
+    ulhu        t0, 0(t5)
+    ulhu        t1, 0(s7)
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t0, t0
+    addu        t0, t0, s6
+    srl         t0, 2
+    xori        s6, s6, 3
+    sb          t0, 0(t4)
+    addiu       t5, 2
+    addiu       t4, 1
+    bne         t8, t4, 3b
+     addiu      s7, 2
+4:
+    lbux        t1, t6(t5)
+    sll         t1, 1
+    lbux        t0, t6(s7)
+    sll         t0, 1
+    addu        t1, t1, t0
+    addu        t3, t1, s6
+    srl         t0, t3, 2       /* t2 = pixval1 */
+    xori        s6, s6, 3
+    addu        t2, t1, s6
+    srl         t1, t2, 2       /* t3 = pixval2 */
+    blez        s2, 6f
+     append     t1, t0, 8
+5:
+    ush         t1, 0(t4)
+    addiu       s2, -1
+    bgtz        s2, 5b
+     addiu      t4, 2
+6:
+    beqz        t9, 7f
+     nop
+    sb          t0, 0(t4)
+7:
+    addiu       s1, 4
+    addiu       a2, -1
+    bnez        a2, 0b
+     addiu      s0, 8
+8:
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_h2v2_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
+/*
+ * a0     = input_data
+ * a1     = output_data
+ * a2     = compptr->v_samp_factor
+ * a3     = cinfo->max_v_samp_factor
+ * 16(sp) = cinfo->smoothing_factor
+ * 20(sp) = compptr->width_in_blocks
+ * 24(sp) = cinfo->image_width
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          s7, 52(sp)      /* compptr->width_in_blocks */
+    lw          s0, 56(sp)      /* cinfo->image_width */
+    lw          s6, 48(sp)      /* cinfo->smoothing_factor */
+    sll         s7, 3           /* output_cols = width_in_blocks * DCTSIZE */
+    sll         v0, s7, 1
+    subu        v0, v0, s0
+    blez        v0, 2f
+    move        v1, zero
+    addiu       t0, a3, 2       /* t0 = cinfo->max_v_samp_factor + 2 */
+0:
+    addiu       t1, a0, -4
+    sll         t2, v1, 2
+    lwx         t1, t2(t1)
+    move        t3, v0
+    addu        t1, t1, s0
+    lbu         t2, -1(t1)
+1:
+    addiu       t3, t3, -1
+    sb          t2, 0(t1)
+    bgtz        t3, 1b
+    addiu       t1, t1, 1
+    addiu       v1, v1, 1
+    bne         v1, t0, 0b
+    nop
+2:
+    li          v0, 80
+    mul         v0, s6, v0
+    li          v1, 16384
+    move        t4, zero
+    move        t5, zero
+    subu        t6, v1, v0      /* t6 = 16384 - tmp_smoot_f * 80 */
+    sll         t7, s6, 4       /* t7 = tmp_smoot_f * 16 */
+3:
+/* Special case for first column: pretend column -1 is same as column 0 */
+    sll         v0, t4, 2
+    lwx         t8, v0(a1)      /*  outptr = output_data[outrow] */
+    sll         v1, t5, 2
+    addiu       t9, v1, 4
+    addiu       s0, v1, -4
+    addiu       s1, v1, 8
+    lwx         s2, v1(a0)      /* inptr0 = input_data[inrow] */
+    lwx         t9, t9(a0)      /* inptr1 = input_data[inrow+1] */
+    lwx         s0, s0(a0)      /* above_ptr = input_data[inrow-1] */
+    lwx         s1, s1(a0)      /* below_ptr = input_data[inrow+2] */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, 0(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, 0(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, 0(s0)
+    lbu         t0, 0(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    v0, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    addiu       s1, s1, 2
+    sb          v0, -1(t8)
+    addiu       s4, s7, -2
+    and         s4, s4, 3
+    addu        s5, s4, t8      /* end address */
+4:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    addiu       t8, t8, 1
+    addiu       s2, s2, 2
+    addiu       t9, t9, 2
+    addiu       s0, s0, 2
+    sb          t2, -1(t8)
+    bne         s5, t8, 4b
+    addiu       s1, s1, 2
+    addiu       s5, s7, -2
+    subu        s5, s5, s4
+    addu        s5, s5, t8      /* end address */
+5:
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 2(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 2(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 2(s0)
+    addu        t0, t0, v0
+    lbu         t3, 2(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    lh          v1, 2(t9)
+    addu        t0, t0, v0
+    lh          v0, 2(s2)
+    addu        s3, t0, s3
+    lh          t0, 2(s0)
+    lh          t1, 2(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 4(s2)
+    lbu         t0, 1(t9)
+    lbu         t1, 4(t9)
+    sb          t2, 0(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 1(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 4(s0)
+    addu        t0, t0, v0
+    lbu         v0, 1(s0)
+    addu        s3, t0, s3
+    lbu         t0, 1(s1)
+    lbu         t3, 4(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 4(t9)
+    addu        t0, t0, v0
+    lh          v0, 4(s2)
+    addu        s3, t0, s3
+    lh          t0, 4(s0)
+    lh          t1, 4(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t2, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 6(s2)
+    lbu         t0, 3(t9)
+    lbu         t1, 6(t9)
+    sb          t2, 1(t8)
+    raddu.w.qb  t3, v0
+    lbu         v0, 3(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t3, t6
+    addu        v0, v0, v1
+    lbu         t2, 6(s0)
+    addu        t0, t0, v0
+    lbu         v0, 3(s0)
+    addu        s3, t0, s3
+    lbu         t0, 3(s1)
+    lbu         t3, 6(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    lh          v1, 6(t9)
+    addu        t0, t0, v0
+    lh          v0, 6(s2)
+    addu        s3, t0, s3
+    lh          t0, 6(s0)
+    lh          t1, 6(s1)
+    madd        $ac1, s3, t7
+    extr_r.w    t3, $ac1, 16
+    ins         t0, t1, 16, 16
+    ins         v0, v1, 16, 16
+    raddu.w.qb  s3, t0
+    lbu         v1, 8(s2)
+    lbu         t0, 5(t9)
+    lbu         t1, 8(t9)
+    sb          t3, 2(t8)
+    raddu.w.qb  t2, v0
+    lbu         v0, 5(s2)
+    addu        t0, t0, t1
+    mult        $ac1, t2, t6
+    addu        v0, v0, v1
+    lbu         t2, 8(s0)
+    addu        t0, t0, v0
+    lbu         v0, 5(s0)
+    addu        s3, t0, s3
+    lbu         t0, 5(s1)
+    lbu         t3, 8(s1)
+    addu        v0, v0, t2
+    sll         s3, s3, 1
+    addu        t0, t0, t3
+    addiu       t8, t8, 4
+    addu        t0, t0, v0
+    addiu       s2, s2, 8
+    addu        s3, t0, s3
+    addiu       t9, t9, 8
+    madd        $ac1, s3, t7
+    extr_r.w    t1, $ac1, 16
+    addiu       s0, s0, 8
+    addiu       s1, s1, 8
+    bne         s5, t8, 5b
+    sb          t1, -1(t8)
+/* Special case for last column */
+    lh          v0, 0(s2)
+    lh          v1, 0(t9)
+    lh          t0, 0(s0)
+    lh          t1, 0(s1)
+    ins         v0, v1, 16, 16
+    ins         t0, t1, 16, 16
+    raddu.w.qb  t2, v0
+    raddu.w.qb  s3, t0
+    lbu         v0, -1(s2)
+    lbu         v1, 1(s2)
+    lbu         t0, -1(t9)
+    lbu         t1, 1(t9)
+    addu        v0, v0, v1
+    mult        $ac1, t2, t6
+    addu        t0, t0, t1
+    lbu         t2, 1(s0)
+    addu        t0, t0, v0
+    lbu         t3, 1(s1)
+    addu        s3, t0, s3
+    lbu         v0, -1(s0)
+    lbu         t0, -1(s1)
+    sll         s3, s3, 1
+    addu        v0, v0, t2
+    addu        t0, t0, t3
+    addu        t0, t0, v0
+    addu        s3, t0, s3
+    madd        $ac1, s3, t7
+    extr_r.w    t0, $ac1, 16
+    addiu       t5, t5, 2
+    sb          t0, 0(t8)
+    addiu       t4, t4, 1
+    bne         t4, a2, 3b
+    addiu       t5, t5, 2
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_h2v2_smooth_downsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_int_upsample_dspr2)
+/*
+ * a0     = upsample->h_expand[compptr->component_index]
+ * a1     = upsample->v_expand[compptr->component_index]
+ * a2     = input_data
+ * a3     = output_data_ptr
+ * 16(sp) = cinfo->output_width
+ * 20(sp) = cinfo->max_v_samp_factor
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    lw          s0, 0(a3)       /* s0 = output_data */
+    lw          s1, 32(sp)      /* s1 = cinfo->output_width */
+    lw          s2, 36(sp)      /* s2 = cinfo->max_v_samp_factor */
+    li          t6, 0           /* t6 = inrow */
+    beqz        s2, 10f
+     li         s3, 0           /* s3 = outrow */
+0:
+    addu        t0, a2, t6
+    addu        t7, s0, s3
+    lw          t3, 0(t0)       /* t3 = inptr */
+    lw          t8, 0(t7)       /* t8 = outptr */
+    beqz        s1, 4f
+     addu       t5, t8, s1      /* t5 = outend */
+1:
+    lb          t2, 0(t3)       /* t2 = invalue = *inptr++ */
+    addiu       t3, 1
+    beqz        a0, 3f
+     move       t0, a0          /* t0 = h_expand */
+2:
+    sb          t2, 0(t8)
+    addiu       t0, -1
+    bgtz        t0, 2b
+     addiu      t8, 1
+3:
+    bgt         t5, t8, 1b
+     nop
+4:
+    addiu       t9, a1, -1      /* t9 = v_expand - 1 */
+    blez        t9, 9f
+     nop
+5:
+    lw          t3, 0(s0)
+    lw          t4, 4(s0)
+    subu        t0, s1, 0xF
+    blez        t0, 7f
+     addu       t5, t3, s1      /* t5 = end address */
+    andi        t7, s1, 0xF     /* t7 = residual */
+    subu        t8, t5, t7
+6:
+    ulw         t0, 0(t3)
+    ulw         t1, 4(t3)
+    ulw         t2, 8(t3)
+    usw         t0, 0(t4)
+    ulw         t0, 12(t3)
+    usw         t1, 4(t4)
+    usw         t2, 8(t4)
+    usw         t0, 12(t4)
+    addiu       t3, 16
+    bne         t3, t8, 6b
+     addiu      t4, 16
+    beqz        t7, 8f
+     nop
+7:
+    lbu         t0, 0(t3)
+    sb          t0, 0(t4)
+    addiu       t3, 1
+    bne         t3, t5, 7b
+     addiu      t4, 1
+8:
+    addiu       t9, -1
+    bgtz        t9, 5b
+     addiu      s0, 8
+9:
+    addu        s3, s3, a1
+    bne         s3, s2, 0b
+     addiu      t6, 1
+10:
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+END(jsimd_int_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)       /* t7 = output_data */
+    andi        t8, a1, 0xf     /* t8 = residual */
+    sll         t0, a0, 2
+    blez        a0, 4f
+     addu       t9, t7, t0      /* t9 = output_data end address */
+0:
+    lw          t5, 0(t7)       /* t5 = outptr */
+    lw          t6, 0(a2)       /* t6 = inptr */
+    addu        t3, t5, a1      /* t3 = outptr + output_width (end address) */
+    subu        t3, t8          /* t3 = end address - residual */
+    beq         t5, t3, 2f
+     move       t4, t8
+1:
+    ulw         t0, 0(t6)       /* t0 = |P3|P2|P1|P0| */
+    ulw         t2, 4(t6)       /* t2 = |P7|P6|P5|P4| */
+    srl         t1, t0, 16      /* t1 = |X|X|P3|P2| */
+    ins         t0, t0, 16, 16  /* t0 = |P1|P0|P1|P0| */
+    ins         t1, t1, 16, 16  /* t1 = |P3|P2|P3|P2| */
+    ins         t0, t0, 8, 16   /* t0 = |P1|P1|P0|P0| */
+    ins         t1, t1, 8, 16   /* t1 = |P3|P3|P2|P2| */
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t0, t2, 16      /* t0 = |X|X|P7|P6| */
+    ins         t2, t2, 16, 16  /* t2 = |P5|P4|P5|P4| */
+    ins         t0, t0, 16, 16  /* t0 = |P7|P6|P7|P6| */
+    ins         t2, t2, 8, 16   /* t2 = |P5|P5|P4|P4| */
+    ins         t0, t0, 8, 16   /* t0 = |P7|P7|P6|P6| */
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t5, 16
+    bne         t5, t3, 1b
+     addiu      t6, 8
+    beqz        t8, 3f
+     move       t4, t8
+2:
+    lbu         t1, 0(t6)
+    sb          t1, 0(t5)
+    sb          t1, 1(t5)
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    addiu       t7, 4
+    bne         t9, t7, 0b
+     addiu      a2, 4
+4:
+    j           ra
+     nop
+END(jsimd_h2v1_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
+/*
+ * a0 = cinfo->max_v_samp_factor
+ * a1 = cinfo->output_width
+ * a2 = input_data
+ * a3 = output_data_ptr
+ */
+    lw          t7, 0(a3)
+    blez        a0, 7f
+     andi       t9, a1, 0xf     /* t9 = residual */
+0:
+    lw          t6, 0(a2)       /* t6 = inptr */
+    lw          t5, 0(t7)       /* t5 = outptr */
+    addu        t8, t5, a1      /* t8 = outptr end address */
+    subu        t8, t9          /* t8 = end address - residual */
+    beq         t5, t8, 2f
+     move       t4, t9
+1:
+    ulw         t0, 0(t6)
+    srl         t1, t0, 16
+    ins         t0, t0, 16, 16
+    ins         t0, t0, 8, 16
+    ins         t1, t1, 16, 16
+    ins         t1, t1, 8, 16
+    ulw         t2, 4(t6)
+    usw         t0, 0(t5)
+    usw         t1, 4(t5)
+    srl         t3, t2, 16
+    ins         t2, t2, 16, 16
+    ins         t2, t2, 8, 16
+    ins         t3, t3, 16, 16
+    ins         t3, t3, 8, 16
+    usw         t2, 8(t5)
+    usw         t3, 12(t5)
+    addiu       t5, 16
+    bne         t5, t8, 1b
+     addiu      t6, 8
+    beqz        t9, 3f
+     move       t4, t9
+2:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    sb          t0, 1(t5)
+    addiu       t4, -2
+    addiu       t6, 1
+    bgtz        t4, 2b
+     addiu      t5, 2
+3:
+    lw          t6, 0(t7)       /* t6 = outptr[0] */
+    lw          t5, 4(t7)       /* t5 = outptr[1] */
+    addu        t4, t6, a1      /* t4 = new end address */
+    beq         a1, t9, 5f
+     subu       t8, t4, t9
+4:
+    ulw         t0, 0(t6)
+    ulw         t1, 4(t6)
+    ulw         t2, 8(t6)
+    usw         t0, 0(t5)
+    ulw         t0, 12(t6)
+    usw         t1, 4(t5)
+    usw         t2, 8(t5)
+    usw         t0, 12(t5)
+    addiu       t6, 16
+    bne         t6, t8, 4b
+     addiu      t5, 16
+    beqz        t9, 6f
+     nop
+5:
+    lbu         t0, 0(t6)
+    sb          t0, 0(t5)
+    addiu       t6, 1
+    bne         t6, t4, 5b
+     addiu      t5, 1
+6:
+    addiu       t7, 8
+    addiu       a0, -2
+    bgtz        a0, 0b
+     addiu      a2, 4
+7:
+    j           ra
+     nop
+END(jsimd_h2v2_upsample_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_islow_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = compptr->dcttable
+ * a2 = output
+ * a3 = range_limit
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -256
+    move        v0, sp
+    addiu       v1, zero, 8     /* v1 = DCTSIZE = 8 */
+1:
+    lh          s4, 32(a0)      /* s4 = inptr[16] */
+    lh          s5, 64(a0)      /* s5 = inptr[32] */
+    lh          s6, 96(a0)      /* s6 = inptr[48] */
+    lh          t1, 112(a0)     /* t1 = inptr[56] */
+    lh          t7, 16(a0)      /* t7 = inptr[8] */
+    lh          t5, 80(a0)      /* t5 = inptr[40] */
+    lh          t3, 48(a0)      /* t3 = inptr[24] */
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, t5
+    or          s4, s4, t7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 2f
+     addiu      v1, v1, -1
+    lh          s5, 0(a1)       /* quantptr[DCTSIZE*0] */
+    lh          s6, 0(a0)       /* inptr[DCTSIZE*0] */
+    mul         s5, s5, s6      /* DEQUANTIZE(inptr[0], quantptr[0]) */
+    sll         s5, s5, 2
+    sw          s5, 0(v0)
+    sw          s5, 32(v0)
+    sw          s5, 64(v0)
+    sw          s5, 96(v0)
+    sw          s5, 128(v0)
+    sw          s5, 160(v0)
+    sw          s5, 192(v0)
+    b           3f
+     sw         s5, 224(v0)
+2:
+    lh          t0, 112(a1)
+    lh          t2, 48(a1)
+    lh          t4, 80(a1)
+    lh          t6, 16(a1)
+    mul         t0, t0, t1      /* DEQUANTIZE(inptr[DCTSIZE*7],
+                                              quantptr[DCTSIZE*7]) */
+    mul         t1, t2, t3      /* DEQUANTIZE(inptr[DCTSIZE*3],
+                                              quantptr[DCTSIZE*3]) */
+    mul         t2, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*5],
+                                              quantptr[DCTSIZE*5]) */
+    mul         t3, t6, t7      /* DEQUANTIZE(inptr[DCTSIZE*1],
+                                              quantptr[DCTSIZE*1]) */
+    lh          t4, 32(a1)
+    lh          t5, 32(a0)
+    lh          t6, 96(a1)
+    lh          t7, 96(a0)
+    addu        s0, t0, t1       /* z3 = tmp0 + tmp2 */
+    addu        s1, t1, t2       /* z2 = tmp1 + tmp2 */
+    addu        s2, t2, t3       /* z4 = tmp1 + tmp3 */
+    addu        s3, s0, s2       /* z3 + z4 */
+    addiu       t9, zero, 9633   /* FIX_1_175875602 */
+    mul         s3, s3, t9       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    addu        t8, t0, t3       /* z1 = tmp0 + tmp3 */
+    addiu       t9, zero, 2446   /* FIX_0_298631336 */
+    mul         t0, t0, t9       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    addiu       t9, zero, 16819  /* FIX_2_053119869 */
+    mul         t2, t2, t9       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    addiu       t9, zero, 25172  /* FIX_3_072711026 */
+    mul         t1, t1, t9       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    addiu       t9, zero, 12299  /* FIX_1_501321110 */
+    mul         t3, t3, t9       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    addiu       t9, zero, 16069  /* FIX_1_961570560 */
+    mul         s0, s0, t9       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+    addiu       t9, zero, 3196   /* FIX_0_390180644 */
+    mul         s2, s2, t9       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+    addiu       t9, zero, 7373   /* FIX_0_899976223 */
+    mul         t8, t8, t9       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+    addiu       t9, zero, 20995  /* FIX_2_562915447 */
+    mul         s1, s1, t9       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+    subu        s0, s3, s0       /* z3 += z5 */
+    addu        t0, t0, s0       /* tmp0 += z3 */
+    addu        t1, t1, s0       /* tmp2 += z3 */
+    subu        s2, s3, s2       /* z4 += z5 */
+    addu        t2, t2, s2       /* tmp1 += z4 */
+    addu        t3, t3, s2       /* tmp3 += z4 */
+    subu        t0, t0, t8       /* tmp0 += z1 */
+    subu        t1, t1, s1       /* tmp2 += z2 */
+    subu        t2, t2, s1       /* tmp1 += z2 */
+    subu        t3, t3, t8       /* tmp3 += z1 */
+    mul         s0, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*2],
+                                               quantptr[DCTSIZE*2]) */
+    addiu       t9, zero, 6270   /* FIX_0_765366865 */
+    mul         s1, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*6],
+                                               quantptr[DCTSIZE*6]) */
+    lh          t4, 0(a1)
+    lh          t5, 0(a0)
+    lh          t6, 64(a1)
+    lh          t7, 64(a0)
+    mul         s2, t9, s0       /* MULTIPLY(z2, FIX_0_765366865) */
+    mul         t5, t4, t5       /* DEQUANTIZE(inptr[DCTSIZE*0],
+                                               quantptr[DCTSIZE*0]) */
+    mul         t6, t6, t7       /* DEQUANTIZE(inptr[DCTSIZE*4],
+                                               quantptr[DCTSIZE*4]) */
+    addiu       t9, zero, 4433   /* FIX_0_541196100 */
+    addu        s3, s0, s1       /* z2 + z3 */
+    mul         s3, s3, t9       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+    addiu       t9, zero, 15137  /* FIX_1_847759065 */
+    mul         t8, s1, t9       /* MULTIPLY(z3, FIX_1_847759065) */
+    addu        t4, t5, t6
+    subu        t5, t5, t6
+    sll         t4, t4, 13      /* tmp0 = (z2 + z3) << CONST_BITS */
+    sll         t5, t5, 13      /* tmp1 = (z2 - z3) << CONST_BITS */
+    addu        t7, s3, s2      /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
+    subu        t6, s3, t8      /* tmp2 =
+                                     z1 + MULTIPLY(z3, -FIX_1_847759065) */
+    addu        s0, t4, t7
+    subu        s1, t4, t7
+    addu        s2, t5, t6
+    subu        s3, t5, t6
+    addu        t4, s0, t3
+    subu        s0, s0, t3
+    addu        t3, s2, t1
+    subu        s2, s2, t1
+    addu        t1, s3, t2
+    subu        s3, s3, t2
+    addu        t2, s1, t0
+    subu        s1, s1, t0
+    shra_r.w    t4, t4, 11
+    shra_r.w    t3, t3, 11
+    shra_r.w    t1, t1, 11
+    shra_r.w    t2, t2, 11
+    shra_r.w    s1, s1, 11
+    shra_r.w    s3, s3, 11
+    shra_r.w    s2, s2, 11
+    shra_r.w    s0, s0, 11
+    sw          t4, 0(v0)
+    sw          t3, 32(v0)
+    sw          t1, 64(v0)
+    sw          t2, 96(v0)
+    sw          s1, 128(v0)
+    sw          s3, 160(v0)
+    sw          s2, 192(v0)
+    sw          s0, 224(v0)
+3:
+    addiu       a1, a1, 2
+    addiu       a0, a0, 2
+    bgtz        v1, 1b
+     addiu      v0, v0, 4
+    move        v0, sp
+    addiu       v1, zero, 8
+4:
+    lw          t0, 8(v0)       /* z2 = (JLONG)wsptr[2] */
+    lw          t1, 24(v0)      /* z3 = (JLONG)wsptr[6] */
+    lw          t2, 0(v0)       /* (JLONG)wsptr[0] */
+    lw          t3, 16(v0)      /* (JLONG)wsptr[4] */
+    lw          s4, 4(v0)       /* (JLONG)wsptr[1] */
+    lw          s5, 12(v0)      /* (JLONG)wsptr[3] */
+    lw          s6, 20(v0)      /* (JLONG)wsptr[5] */
+    lw          s7, 28(v0)      /* (JLONG)wsptr[7] */
+    or          s4, s4, t0
+    or          s4, s4, t1
+    or          s4, s4, t3
+    or          s4, s4, s7
+    or          s4, s4, s5
+    or          s4, s4, s6
+    bnez        s4, 5f
+     addiu      v1, v1, -1
+    shra_r.w    s5, t2, 5
+    andi        s5, s5, 0x3ff
+    lbux        s5, s5(a3)
+    lw          s1, 0(a2)
+    replv.qb    s5, s5
+    usw         s5, 0(s1)
+    usw         s5, 4(s1)
+    b           6f
+     nop
+5:
+    addu        t4, t0, t1       /* z2 + z3 */
+    addiu       t8, zero, 4433   /* FIX_0_541196100 */
+    mul         t5, t4, t8       /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
+    addiu       t8, zero, 15137  /* FIX_1_847759065 */
+    mul         t1, t1, t8       /* MULTIPLY(z3, FIX_1_847759065) */
+    addiu       t8, zero, 6270   /* FIX_0_765366865 */
+    mul         t0, t0, t8       /* MULTIPLY(z2, FIX_0_765366865) */
+    addu        t4, t2, t3       /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
+    subu        t2, t2, t3       /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
+    sll         t4, t4, 13       /* tmp0 =
+                                      (wsptr[0] + wsptr[4]) << CONST_BITS */
+    sll         t2, t2, 13       /* tmp1 =
+                                      (wsptr[0] - wsptr[4]) << CONST_BITS */
+    subu        t1, t5, t1       /* tmp2 =
+                                      z1 + MULTIPLY(z3, -FIX_1_847759065) */
+    subu        t3, t2, t1       /* tmp12 = tmp1 - tmp2 */
+    addu        t2, t2, t1       /* tmp11 = tmp1 + tmp2 */
+    addu        t5, t5, t0       /* tmp3 =
+                                      z1 + MULTIPLY(z2, FIX_0_765366865) */
+    subu        t1, t4, t5       /* tmp13 = tmp0 - tmp3 */
+    addu        t0, t4, t5       /* tmp10 = tmp0 + tmp3 */
+    lw          t4, 28(v0)       /* tmp0 = (JLONG)wsptr[7] */
+    lw          t6, 12(v0)       /* tmp2 = (JLONG)wsptr[3] */
+    lw          t5, 20(v0)       /* tmp1 = (JLONG)wsptr[5] */
+    lw          t7, 4(v0)        /* tmp3 = (JLONG)wsptr[1] */
+    addu        s0, t4, t6       /* z3 = tmp0 + tmp2 */
+    addiu       t8, zero, 9633   /* FIX_1_175875602 */
+    addu        s1, t5, t7       /* z4 = tmp1 + tmp3 */
+    addu        s2, s0, s1       /* z3 + z4 */
+    mul         s2, s2, t8       /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
+    addu        s3, t4, t7       /* z1 = tmp0 + tmp3 */
+    addu        t9, t5, t6       /* z2 = tmp1 + tmp2 */
+    addiu       t8, zero, 16069  /* FIX_1_961570560 */
+    mul         s0, s0, t8       /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
+    addiu       t8, zero, 3196   /* FIX_0_390180644 */
+    mul         s1, s1, t8       /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
+    addiu       t8, zero, 2446   /* FIX_0_298631336 */
+    mul         t4, t4, t8       /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
+    addiu       t8, zero, 7373   /* FIX_0_899976223 */
+    mul         s3, s3, t8       /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
+    addiu       t8, zero, 16819  /* FIX_2_053119869 */
+    mul         t5, t5, t8       /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
+    addiu       t8, zero, 20995  /* FIX_2_562915447 */
+    mul         t9, t9, t8       /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
+    addiu       t8, zero, 25172  /* FIX_3_072711026 */
+    mul         t6, t6, t8       /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
+    addiu       t8, zero, 12299  /* FIX_1_501321110 */
+    mul         t7, t7, t8       /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
+    subu        s0, s2, s0       /* z3 += z5 */
+    subu        s1, s2, s1       /* z4 += z5 */
+    addu        t4, t4, s0
+    subu        t4, t4, s3      /* tmp0 */
+    addu        t5, t5, s1
+    subu        t5, t5, t9      /* tmp1 */
+    addu        t6, t6, s0
+    subu        t6, t6, t9      /* tmp2 */
+    addu        t7, t7, s1
+    subu        t7, t7, s3      /* tmp3 */
+    addu        s0, t0, t7
+    subu        t0, t0, t7
+    addu        t7, t2, t6
+    subu        t2, t2, t6
+    addu        t6, t3, t5
+    subu        t3, t3, t5
+    addu        t5, t1, t4
+    subu        t1, t1, t4
+    shra_r.w    s0, s0, 18
+    shra_r.w    t7, t7, 18
+    shra_r.w    t6, t6, 18
+    shra_r.w    t5, t5, 18
+    shra_r.w    t1, t1, 18
+    shra_r.w    t3, t3, 18
+    shra_r.w    t2, t2, 18
+    shra_r.w    t0, t0, 18
+    andi        s0, s0, 0x3ff
+    andi        t7, t7, 0x3ff
+    andi        t6, t6, 0x3ff
+    andi        t5, t5, 0x3ff
+    andi        t1, t1, 0x3ff
+    andi        t3, t3, 0x3ff
+    andi        t2, t2, 0x3ff
+    andi        t0, t0, 0x3ff
+    lw          s1, 0(a2)
+    lbux        s0, s0(a3)
+    lbux        t7, t7(a3)
+    lbux        t6, t6(a3)
+    lbux        t5, t5(a3)
+    lbux        t1, t1(a3)
+    lbux        t3, t3(a3)
+    lbux        t2, t2(a3)
+    lbux        t0, t0(a3)
+    sb          s0, 0(s1)
+    sb          t7, 1(s1)
+    sb          t6, 2(s1)
+    sb          t5, 3(s1)
+    sb          t1, 4(s1)
+    sb          t3, 5(s1)
+    sb          t2, 6(s1)
+    sb          t0, 7(s1)
+6:
+    addiu       v0, v0, 32
+    bgtz        v1, 4b
+     addiu      a2, a2, 4
+    addiu       sp, sp, 256
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_islow_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
+/*
+ * a0 = inptr
+ * a1 = quantptr
+ * a2 = wsptr
+ * a3 = mips_idct_ifast_coefs
+ */
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu         t9, a0, 16      /* end address */
+    or            AT, a3, zero
+
+0:
+    lw            s0, 0(a1)       /* quantptr[DCTSIZE*0] */
+    lw            t0, 0(a0)       /* inptr[DCTSIZE*0] */
+    lw            t1, 16(a0)      /* inptr[DCTSIZE*1] */
+    muleq_s.w.phl v0, t0, s0      /* tmp0 ... */
+    lw            t2, 32(a0)      /* inptr[DCTSIZE*2] */
+    lw            t3, 48(a0)      /* inptr[DCTSIZE*3] */
+    lw            t4, 64(a0)      /* inptr[DCTSIZE*4] */
+    lw            t5, 80(a0)      /* inptr[DCTSIZE*5] */
+    muleq_s.w.phr t0, t0, s0      /* ... tmp0 ... */
+    lw            t6, 96(a0)      /* inptr[DCTSIZE*6] */
+    lw            t7, 112(a0)     /* inptr[DCTSIZE*7] */
+    or            s4, t1, t2
+    or            s5, t3, t4
+    bnez          s4, 1f
+     ins          t0, v0, 16, 16  /* ... tmp0 */
+    bnez          s5, 1f
+     or           s6, t5, t6
+    or            s6, s6, t7
+    bnez          s6, 1f
+     sw           t0, 0(a2)       /* wsptr[DCTSIZE*0] */
+    sw            t0, 16(a2)      /* wsptr[DCTSIZE*1] */
+    sw            t0, 32(a2)      /* wsptr[DCTSIZE*2] */
+    sw            t0, 48(a2)      /* wsptr[DCTSIZE*3] */
+    sw            t0, 64(a2)      /* wsptr[DCTSIZE*4] */
+    sw            t0, 80(a2)      /* wsptr[DCTSIZE*5] */
+    sw            t0, 96(a2)      /* wsptr[DCTSIZE*6] */
+    sw            t0, 112(a2)     /* wsptr[DCTSIZE*7] */
+    addiu         a0, a0, 4
+    b             2f
+     addiu        a1, a1, 4
+
+1:
+    lw            s1, 32(a1)      /* quantptr[DCTSIZE*2] */
+    lw            s2, 64(a1)      /* quantptr[DCTSIZE*4] */
+    muleq_s.w.phl v0, t2, s1      /* tmp1 ... */
+    muleq_s.w.phr t2, t2, s1      /* ... tmp1 ... */
+    lw            s0, 16(a1)      /* quantptr[DCTSIZE*1] */
+    lw            s1, 48(a1)      /* quantptr[DCTSIZE*3] */
+    lw            s3, 96(a1)      /* quantptr[DCTSIZE*6] */
+    muleq_s.w.phl v1, t4, s2      /* tmp2 ... */
+    muleq_s.w.phr t4, t4, s2      /* ... tmp2 ... */
+    lw            s2, 80(a1)      /* quantptr[DCTSIZE*5] */
+    lw            t8, 4(AT)       /* FIX(1.414213562) */
+    ins           t2, v0, 16, 16  /* ... tmp1 */
+    muleq_s.w.phl v0, t6, s3      /* tmp3 ... */
+    muleq_s.w.phr t6, t6, s3      /* ... tmp3 ... */
+    ins           t4, v1, 16, 16  /* ... tmp2 */
+    addq.ph       s4, t0, t4      /* tmp10 */
+    subq.ph       s5, t0, t4      /* tmp11 */
+    ins           t6, v0, 16, 16  /* ... tmp3 */
+    subq.ph       s6, t2, t6      /* tmp12 ... */
+    addq.ph       s7, t2, t6      /* tmp13 */
+    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
+    addq.ph       t0, s4, s7      /* tmp0 */
+    subq.ph       t6, s4, s7      /* tmp3 */
+    muleq_s.w.phl v0, t1, s0      /* tmp4 ... */
+    muleq_s.w.phr t1, t1, s0      /* ... tmp4 ... */
+    shll_s.ph     s6, s6, 1       /* x2 */
+    lw            s3, 112(a1)     /* quantptr[DCTSIZE*7] */
+    subq.ph       s6, s6, s7      /* ... tmp12 */
+    muleq_s.w.phl v1, t7, s3      /* tmp7 ... */
+    muleq_s.w.phr t7, t7, s3      /* ... tmp7 ... */
+    ins           t1, v0, 16, 16  /* ... tmp4 */
+    addq.ph       t2, s5, s6      /* tmp1 */
+    subq.ph       t4, s5, s6      /* tmp2 */
+    muleq_s.w.phl v0, t5, s2      /* tmp6 ... */
+    muleq_s.w.phr t5, t5, s2      /* ... tmp6 ... */
+    ins           t7, v1, 16, 16  /* ... tmp7 */
+    addq.ph       s5, t1, t7      /* z11 */
+    subq.ph       s6, t1, t7      /* z12 */
+    muleq_s.w.phl v1, t3, s1      /* tmp5 ... */
+    muleq_s.w.phr t3, t3, s1      /* ... tmp5 ... */
+    ins           t5, v0, 16, 16  /* ... tmp6 */
+    ins           t3, v1, 16, 16  /* ... tmp5 */
+    addq.ph       s7, t5, t3      /* z13 */
+    subq.ph       v0, t5, t3      /* z10 */
+    addq.ph       t7, s5, s7      /* tmp7 */
+    subq.ph       s5, s5, s7      /* tmp11 ... */
+    addq.ph       v1, v0, s6      /* z5 ... */
+    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
+    lw            t8, 8(AT)       /* FIX(1.847759065) */
+    lw            s4, 0(AT)       /* FIX(1.082392200) */
+    addq.ph       s0, t0, t7
+    subq.ph       s1, t0, t7
+    mulq_s.ph     v1, v1, t8      /* ... z5 */
+    shll_s.ph     s5, s5, 1       /* x2 */
+    lw            t8, 12(AT)      /* FIX(-2.613125930) */
+    sw            s0, 0(a2)       /* wsptr[DCTSIZE*0] */
+    shll_s.ph     v0, v0, 1       /* x4 */
+    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
+    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
+    shll_s.ph     v1, v1, 1       /* x2 */
+    addiu         a0, a0, 4
+    addiu         a1, a1, 4
+    sw            s1, 112(a2)     /* wsptr[DCTSIZE*7] */
+    shll_s.ph     s6, v0, 1       /* x4 */
+    shll_s.ph     s4, s4, 1       /* x2 */
+    addq.ph       s6, s6, v1      /* ... tmp12 */
+    subq.ph       t5, s6, t7      /* tmp6 */
+    subq.ph       s4, s4, v1      /* ... tmp10 */
+    subq.ph       t3, s5, t5      /* tmp5 */
+    addq.ph       s2, t2, t5
+    addq.ph       t1, s4, t3      /* tmp4 */
+    subq.ph       s3, t2, t5
+    sw            s2, 16(a2)      /* wsptr[DCTSIZE*1] */
+    sw            s3, 96(a2)      /* wsptr[DCTSIZE*6] */
+    addq.ph       v0, t4, t3
+    subq.ph       v1, t4, t3
+    sw            v0, 32(a2)      /* wsptr[DCTSIZE*2] */
+    sw            v1, 80(a2)      /* wsptr[DCTSIZE*5] */
+    addq.ph       v0, t6, t1
+    subq.ph       v1, t6, t1
+    sw            v0, 64(a2)      /* wsptr[DCTSIZE*4] */
+    sw            v1, 48(a2)      /* wsptr[DCTSIZE*3] */
+
+2:
+    bne           a0, t9, 0b
+     addiu        a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_cols_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
+/*
+ * a0 = wsptr
+ * a1 = output_buf
+ * a2 = output_col
+ * a3 = mips_idct_ifast_coefs
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    addiu         t9, a0, 128     /* end address */
+    lui           s8, 0x8080
+    ori           s8, s8, 0x8080
+
+0:
+    lw            AT, 36(sp)      /* restore $a3 (mips_idct_ifast_coefs) */
+    lw            t0, 0(a0)       /* wsptr[DCTSIZE*0+0/1]  b a */
+    lw            s0, 16(a0)      /* wsptr[DCTSIZE*1+0/1]  B A */
+    lw            t2, 4(a0)       /* wsptr[DCTSIZE*0+2/3]  d c */
+    lw            s2, 20(a0)      /* wsptr[DCTSIZE*1+2/3]  D C */
+    lw            t4, 8(a0)       /* wsptr[DCTSIZE*0+4/5]  f e */
+    lw            s4, 24(a0)      /* wsptr[DCTSIZE*1+4/5]  F E */
+    lw            t6, 12(a0)      /* wsptr[DCTSIZE*0+6/7]  h g */
+    lw            s6, 28(a0)      /* wsptr[DCTSIZE*1+6/7]  H G */
+    precrq.ph.w   t1, s0, t0      /* B b */
+    ins           t0, s0, 16, 16  /* A a */
+    bnez          t1, 1f
+     or           s0, t2, s2
+    bnez          s0, 1f
+     or           s0, t4, s4
+    bnez          s0, 1f
+     or           s0, t6, s6
+    bnez          s0, 1f
+     shll_s.ph    s0, t0, 2       /* A a */
+    lw            a3, 0(a1)
+    lw            AT, 4(a1)
+    precrq.ph.w   t0, s0, s0      /* A A */
+    ins           s0, s0, 16, 16  /* a a */
+    addu          a3, a3, a2
+    addu          AT, AT, a2
+    precrq.qb.ph  t0, t0, t0      /* A A A A */
+    precrq.qb.ph  s0, s0, s0      /* a a a a */
+    addu.qb       s0, s0, s8
+    addu.qb       t0, t0, s8
+    sw            s0, 0(a3)
+    sw            s0, 4(a3)
+    sw            t0, 0(AT)
+    sw            t0, 4(AT)
+    addiu         a0, a0, 32
+    bne           a0, t9, 0b
+     addiu        a1, a1, 8
+    b             2f
+     nop
+
+1:
+    precrq.ph.w   t3, s2, t2
+    ins           t2, s2, 16, 16
+    precrq.ph.w   t5, s4, t4
+    ins           t4, s4, 16, 16
+    precrq.ph.w   t7, s6, t6
+    ins           t6, s6, 16, 16
+    lw            t8, 4(AT)       /* FIX(1.414213562) */
+    addq.ph       s4, t0, t4      /* tmp10 */
+    subq.ph       s5, t0, t4      /* tmp11 */
+    subq.ph       s6, t2, t6      /* tmp12 ... */
+    addq.ph       s7, t2, t6      /* tmp13 */
+    mulq_s.ph     s6, s6, t8      /* ... tmp12 ... */
+    addq.ph       t0, s4, s7      /* tmp0 */
+    subq.ph       t6, s4, s7      /* tmp3 */
+    shll_s.ph     s6, s6, 1       /* x2 */
+    subq.ph       s6, s6, s7      /* ... tmp12 */
+    addq.ph       t2, s5, s6      /* tmp1 */
+    subq.ph       t4, s5, s6      /* tmp2 */
+    addq.ph       s5, t1, t7      /* z11 */
+    subq.ph       s6, t1, t7      /* z12 */
+    addq.ph       s7, t5, t3      /* z13 */
+    subq.ph       v0, t5, t3      /* z10 */
+    addq.ph       t7, s5, s7      /* tmp7 */
+    subq.ph       s5, s5, s7      /* tmp11 ... */
+    addq.ph       v1, v0, s6      /* z5 ... */
+    mulq_s.ph     s5, s5, t8      /* ... tmp11 */
+    lw            t8, 8(AT)       /* FIX(1.847759065) */
+    lw            s4, 0(AT)       /* FIX(1.082392200) */
+    addq.ph       s0, t0, t7      /* tmp0 + tmp7 */
+    subq.ph       s7, t0, t7      /* tmp0 - tmp7 */
+    mulq_s.ph     v1, v1, t8      /* ... z5 */
+    lw            a3, 0(a1)
+    lw            t8, 12(AT)      /* FIX(-2.613125930) */
+    shll_s.ph     s5, s5, 1       /* x2 */
+    addu          a3, a3, a2
+    shll_s.ph     v0, v0, 1       /* x4 */
+    mulq_s.ph     v0, v0, t8      /* tmp12 ... */
+    mulq_s.ph     s4, s6, s4      /* tmp10 ... */
+    shll_s.ph     v1, v1, 1       /* x2 */
+    addiu         a0, a0, 32
+    addiu         a1, a1, 8
+    shll_s.ph     s6, v0, 1       /* x4 */
+    shll_s.ph     s4, s4, 1       /* x2 */
+    addq.ph       s6, s6, v1      /* ... tmp12 */
+    shll_s.ph     s0, s0, 2
+    subq.ph       t5, s6, t7      /* tmp6 */
+    subq.ph       s4, s4, v1      /* ... tmp10 */
+    subq.ph       t3, s5, t5      /* tmp5 */
+    shll_s.ph     s7, s7, 2
+    addq.ph       t1, s4, t3      /* tmp4 */
+    addq.ph       s1, t2, t5      /* tmp1 + tmp6 */
+    subq.ph       s6, t2, t5      /* tmp1 - tmp6 */
+    addq.ph       s2, t4, t3      /* tmp2 + tmp5 */
+    subq.ph       s5, t4, t3      /* tmp2 - tmp5 */
+    addq.ph       s4, t6, t1      /* tmp3 + tmp4 */
+    subq.ph       s3, t6, t1      /* tmp3 - tmp4 */
+    shll_s.ph     s1, s1, 2
+    shll_s.ph     s2, s2, 2
+    shll_s.ph     s3, s3, 2
+    shll_s.ph     s4, s4, 2
+    shll_s.ph     s5, s5, 2
+    shll_s.ph     s6, s6, 2
+    precrq.ph.w   t0, s1, s0      /* B A */
+    ins           s0, s1, 16, 16  /* b a */
+    precrq.ph.w   t2, s3, s2      /* D C */
+    ins           s2, s3, 16, 16  /* d c */
+    precrq.ph.w   t4, s5, s4      /* F E */
+    ins           s4, s5, 16, 16  /* f e */
+    precrq.ph.w   t6, s7, s6      /* H G */
+    ins           s6, s7, 16, 16  /* h g */
+    precrq.qb.ph  t0, t2, t0      /* D C B A */
+    precrq.qb.ph  s0, s2, s0      /* d c b a */
+    precrq.qb.ph  t4, t6, t4      /* H G F E */
+    precrq.qb.ph  s4, s6, s4      /* h g f e */
+    addu.qb       s0, s0, s8
+    addu.qb       s4, s4, s8
+    sw            s0, 0(a3)       /* outptr[0/1/2/3]       d c b a */
+    sw            s4, 4(a3)       /* outptr[4/5/6/7]       h g f e */
+    lw            a3, -4(a1)
+    addu.qb       t0, t0, s8
+    addu          a3, a3, a2
+    addu.qb       t4, t4, s8
+    sw            t0, 0(a3)       /* outptr[0/1/2/3]       D C B A */
+    bne           a0, t9, 0b
+     sw           t4, 4(a3)       /* outptr[4/5/6/7]       H G F E */
+
+2:
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
+
+    j             ra
+     nop
+
+END(jsimd_idct_ifast_rows_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_fdct_islow_dspr2)
+/*
+ * a0 = data
+ */
+    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lui         t0, 6437
+    ori         t0, 2260
+    lui         t1, 9633
+    ori         t1, 11363
+    lui         t2, 0xd39e
+    ori         t2, 0xe6dc
+    lui         t3, 0xf72d
+    ori         t3, 9633
+    lui         t4, 2261
+    ori         t4, 9633
+    lui         t5, 0xd39e
+    ori         t5, 6437
+    lui         t6, 9633
+    ori         t6, 0xd39d
+    lui         t7, 0xe6dc
+    ori         t7, 2260
+    lui         t8, 4433
+    ori         t8, 10703
+    lui         t9, 0xd630
+    ori         t9, 4433
+    li          s8, 8
+    move        a1, a0
+1:
+    lw          s0, 0(a1)       /* tmp0 = 1|0 */
+    lw          s1, 4(a1)       /* tmp1 = 3|2 */
+    lw          s2, 8(a1)       /* tmp2 = 5|4 */
+    lw          s3, 12(a1)      /* tmp3 = 7|6 */
+    packrl.ph   s1, s1, s1      /* tmp1 = 2|3 */
+    packrl.ph   s3, s3, s3      /* tmp3 = 6|7 */
+    subq.ph     s7, s1, s2      /* tmp7 = 2-5|3-4 = t5|t4 */
+    subq.ph     s5, s0, s3      /* tmp5 = 1-6|0-7 = t6|t7 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, s7, t0    /* ac0 += t5*  6437 + t4*  2260 */
+    dpa.w.ph    $ac0, s5, t1    /* ac0 += t6*  9633 + t7* 11363 */
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, s7, t2    /* ac1 += t5*-11362 + t4* -6436 */
+    dpa.w.ph    $ac1, s5, t3    /* ac1 += t6* -2259 + t7*  9633 */
+    mult        $ac2, $0, $0    /* ac2  = 0 */
+    dpa.w.ph    $ac2, s7, t4    /* ac2 += t5*  2261 + t4*  9633 */
+    dpa.w.ph    $ac2, s5, t5    /* ac2 += t6*-11362 + t7*  6437 */
+    mult        $ac3, $0, $0    /* ac3  = 0 */
+    dpa.w.ph    $ac3, s7, t6    /* ac3 += t5*  9633 + t4*-11363 */
+    dpa.w.ph    $ac3, s5, t7    /* ac3 += t6* -6436 + t7*  2260 */
+    addq.ph     s6, s1, s2      /* tmp6 = 2+5|3+4 = t2|t3 */
+    addq.ph     s4, s0, s3      /* tmp4 = 1+6|0+7 = t1|t0 */
+    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
+    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
+    extr_r.w    s2, $ac2, 11    /* tmp2 = (ac2 + 1024) >> 11 */
+    extr_r.w    s3, $ac3, 11    /* tmp3 = (ac3 + 1024) >> 11 */
+    addq.ph     s5, s4, s6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+    subq.ph     s7, s4, s6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+    sh          s0, 2(a1)
+    sh          s1, 6(a1)
+    sh          s2, 10(a1)
+    sh          s3, 14(a1)
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, s7, t8    /* ac0 += t12*  4433 + t13* 10703 */
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, s7, t9    /* ac1 += t12*-10704 + t13*  4433 */
+    sra         s4, s5, 16      /* tmp4 = t11 */
+    addiu       a1, a1, 16
+    addiu       s8, s8, -1
+    extr_r.w    s0, $ac0, 11    /* tmp0 = (ac0 + 1024) >> 11 */
+    extr_r.w    s1, $ac1, 11    /* tmp1 = (ac1 + 1024) >> 11 */
+    addu        s2, s5, s4      /* tmp2 = t10 + t11 */
+    subu        s3, s5, s4      /* tmp3 = t10 - t11 */
+    sll         s2, s2, 2       /* tmp2 = (t10 + t11) << 2 */
+    sll         s3, s3, 2       /* tmp3 = (t10 - t11) << 2 */
+    sh          s2, -16(a1)
+    sh          s3, -8(a1)
+    sh          s0, -12(a1)
+    bgtz        s8, 1b
+     sh         s1, -4(a1)
+    li          t0, 2260
+    li          t1, 11363
+    li          t2, 9633
+    li          t3, 6436
+    li          t4, 6437
+    li          t5, 2261
+    li          t6, 11362
+    li          t7, 2259
+    li          t8, 4433
+    li          t9, 10703
+    li          a1, 10704
+    li          s8, 8
+
+2:
+    lh          a2, 0(a0)       /* 0 */
+    lh          a3, 16(a0)      /* 8 */
+    lh          v0, 32(a0)      /* 16 */
+    lh          v1, 48(a0)      /* 24 */
+    lh          s4, 64(a0)      /* 32 */
+    lh          s5, 80(a0)      /* 40 */
+    lh          s6, 96(a0)      /* 48 */
+    lh          s7, 112(a0)     /* 56 */
+    addu        s2, v0, s5      /* tmp2 = 16 + 40 */
+    subu        s5, v0, s5      /* tmp5 = 16 - 40 */
+    addu        s3, v1, s4      /* tmp3 = 24 + 32 */
+    subu        s4, v1, s4      /* tmp4 = 24 - 32 */
+    addu        s0, a2, s7      /* tmp0 =  0 + 56 */
+    subu        s7, a2, s7      /* tmp7 =  0 - 56 */
+    addu        s1, a3, s6      /* tmp1 =  8 + 48 */
+    subu        s6, a3, s6      /* tmp6 =  8 - 48 */
+    addu        a2, s0, s3      /* tmp10 = tmp0 + tmp3 */
+    subu        v1, s0, s3      /* tmp13 = tmp0 - tmp3 */
+    addu        a3, s1, s2      /* tmp11 = tmp1 + tmp2 */
+    subu        v0, s1, s2      /* tmp12 = tmp1 - tmp2 */
+    mult        s7, t1          /* ac0  = tmp7 * c1 */
+    madd        s4, t0          /* ac0 += tmp4 * c0 */
+    madd        s5, t4          /* ac0 += tmp5 * c4 */
+    madd        s6, t2          /* ac0 += tmp6 * c2 */
+    mult        $ac1, s7, t2    /* ac1  = tmp7 * c2 */
+    msub        $ac1, s4, t3    /* ac1 -= tmp4 * c3 */
+    msub        $ac1, s5, t6    /* ac1 -= tmp5 * c6 */
+    msub        $ac1, s6, t7    /* ac1 -= tmp6 * c7 */
+    mult        $ac2, s7, t4    /* ac2  = tmp7 * c4 */
+    madd        $ac2, s4, t2    /* ac2 += tmp4 * c2 */
+    madd        $ac2, s5, t5    /* ac2 += tmp5 * c5 */
+    msub        $ac2, s6, t6    /* ac2 -= tmp6 * c6 */
+    mult        $ac3, s7, t0    /* ac3  = tmp7 * c0 */
+    msub        $ac3, s4, t1    /* ac3 -= tmp4 * c1 */
+    madd        $ac3, s5, t2    /* ac3 += tmp5 * c2 */
+    msub        $ac3, s6, t3    /* ac3 -= tmp6 * c3 */
+    extr_r.w    s0, $ac0, 15    /* tmp0 = (ac0 + 16384) >> 15 */
+    extr_r.w    s1, $ac1, 15    /* tmp1 = (ac1 + 16384) >> 15 */
+    extr_r.w    s2, $ac2, 15    /* tmp2 = (ac2 + 16384) >> 15 */
+    extr_r.w    s3, $ac3, 15    /* tmp3 = (ac3 + 16384) >> 15 */
+    addiu       s8, s8, -1
+    addu        s4, a2, a3      /* tmp4 = tmp10 + tmp11 */
+    subu        s5, a2, a3      /* tmp5 = tmp10 - tmp11 */
+    sh          s0, 16(a0)
+    sh          s1, 48(a0)
+    sh          s2, 80(a0)
+    sh          s3, 112(a0)
+    mult        v0, t8          /* ac0  = tmp12 * c8 */
+    madd        v1, t9          /* ac0 += tmp13 * c9 */
+    mult        $ac1, v1, t8    /* ac1  = tmp13 * c8 */
+    msub        $ac1, v0, a1    /* ac1 -= tmp12 * c10 */
+    addiu       a0, a0, 2
+    extr_r.w    s6, $ac0, 15    /* tmp6 = (ac0 + 16384) >> 15 */
+    extr_r.w    s7, $ac1, 15    /* tmp7 = (ac1 + 16384) >> 15 */
+    shra_r.w    s4, s4, 2       /* tmp4 = (tmp4 + 2) >> 2 */
+    shra_r.w    s5, s5, 2       /* tmp5 = (tmp5 + 2) >> 2 */
+    sh          s4, -2(a0)
+    sh          s5, 62(a0)
+    sh          s6, 30(a0)
+    bgtz        s8, 2b
+     sh         s7, 94(a0)
+
+    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    jr          ra
+     nop
+
+END(jsimd_fdct_islow_dspr2)
+
+
+/**************************************************************************/
+LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
+/*
+ * a0 = data
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 8, s0, s1
+
+    li          a1, 0x014e014e  /* FIX_1_306562965 (334 << 16) |
+                                                   (334 & 0xffff) */
+    li          a2, 0x008b008b  /* FIX_0_541196100 (139 << 16) |
+                                                   (139 & 0xffff) */
+    li          a3, 0x00620062  /* FIX_0_382683433 (98 << 16) |
+                                                   (98 & 0xffff) */
+    li          s1, 0x00b500b5  /* FIX_0_707106781 (181 << 16) |
+                                                   (181 & 0xffff) */
+
+    move        v0, a0
+    addiu       v1, v0, 128     /* end address */
+
+0:
+    lw          t0, 0(v0)       /* tmp0 = 1|0 */
+    lw          t1, 4(v0)       /* tmp1 = 3|2 */
+    lw          t2, 8(v0)       /* tmp2 = 5|4 */
+    lw          t3, 12(v0)      /* tmp3 = 7|6 */
+    packrl.ph   t1, t1, t1      /* tmp1 = 2|3 */
+    packrl.ph   t3, t3, t3      /* tmp3 = 6|7 */
+    subq.ph     t7, t1, t2      /* tmp7 = 2-5|3-4 = t5|t4 */
+    subq.ph     t5, t0, t3      /* tmp5 = 1-6|0-7 = t6|t7 */
+    addq.ph     t6, t1, t2      /* tmp6 = 2+5|3+4 = t2|t3 */
+    addq.ph     t4, t0, t3      /* tmp4 = 1+6|0+7 = t1|t0 */
+    addq.ph     t8, t4, t6      /* tmp5 = t1+t2|t0+t3 = t11|t10 */
+    subq.ph     t9, t4, t6      /* tmp7 = t1-t2|t0-t3 = t12|t13 */
+    sra         t4, t8, 16      /* tmp4 = t11 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t9, s1
+    mult        $ac1, $0, $0    /* ac1  = 0 */
+    dpa.w.ph    $ac1, t7, a3    /* ac1 += t4*98 + t5*98 */
+    dpsx.w.ph   $ac1, t5, a3    /* ac1 += t6*98 + t7*98 */
+    mult        $ac2, $0, $0    /* ac2  = 0 */
+    dpa.w.ph    $ac2, t7, a2    /* ac2 += t4*139 + t5*139 */
+    mult        $ac3, $0, $0    /* ac3  = 0 */
+    dpa.w.ph    $ac3, t5, a1    /* ac3 += t6*334 + t7*334 */
+    precrq.ph.w t0, t5, t7      /* t0 = t5|t6 */
+    addq.ph     t2, t8, t4      /* tmp2 = t10 + t11 */
+    subq.ph     t3, t8, t4      /* tmp3 = t10 - t11 */
+    extr.w      t4, $ac0, 8
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t0, s1    /* ac0 += t5*181 + t6*181 */
+    extr.w      t0, $ac1, 8     /* t0 = z5 */
+    extr.w      t1, $ac2, 8     /* t1 = MULTIPLY(tmp10, 139) */
+    extr.w      t7, $ac3, 8     /* t2 = MULTIPLY(tmp12, 334) */
+    extr.w      t8, $ac0, 8     /* t8 = z3 = MULTIPLY(tmp11, 181) */
+    add         t6, t1, t0      /* t6 = z2 */
+    add         t7, t7, t0      /* t7 = z4 */
+    subq.ph     t0, t5, t8      /* t0 = z13 = tmp7 - z3 */
+    addq.ph     t8, t5, t8      /* t9 = z11 = tmp7 + z3 */
+    addq.ph     t1, t0, t6      /* t1 = z13 + z2 */
+    subq.ph     t6, t0, t6      /* t6 = z13 - z2 */
+    addq.ph     t0, t8, t7      /* t0 = z11 + z4 */
+    subq.ph     t7, t8, t7      /* t7 = z11 - z4 */
+    addq.ph     t5, t4, t9
+    subq.ph     t4, t9, t4
+    sh          t2, 0(v0)
+    sh          t5, 4(v0)
+    sh          t3, 8(v0)
+    sh          t4, 12(v0)
+    sh          t1, 10(v0)
+    sh          t6, 6(v0)
+    sh          t0, 2(v0)
+    sh          t7, 14(v0)
+    addiu       v0, 16
+    bne         v1, v0, 0b
+     nop
+    move        v0, a0
+    addiu       v1, v0, 16
+
+1:
+    lh          t0, 0(v0)       /* 0 */
+    lh          t1, 16(v0)      /* 8 */
+    lh          t2, 32(v0)      /* 16 */
+    lh          t3, 48(v0)      /* 24 */
+    lh          t4, 64(v0)      /* 32 */
+    lh          t5, 80(v0)      /* 40 */
+    lh          t6, 96(v0)      /* 48 */
+    lh          t7, 112(v0)     /* 56 */
+    add         t8, t0, t7      /* t8 = tmp0 */
+    sub         t7, t0, t7      /* t7 = tmp7 */
+    add         t0, t1, t6      /* t0 = tmp1 */
+    sub         t1, t1, t6      /* t1 = tmp6 */
+    add         t6, t2, t5      /* t6 = tmp2 */
+    sub         t5, t2, t5      /* t5 = tmp5 */
+    add         t2, t3, t4      /* t2 = tmp3 */
+    sub         t3, t3, t4      /* t3 = tmp4 */
+    add         t4, t8, t2      /* t4 = tmp10 = tmp0 + tmp3 */
+    sub         t8, t8, t2      /* t8 = tmp13 = tmp0 - tmp3 */
+    sub         s0, t0, t6      /* s0 = tmp12 = tmp1 - tmp2 */
+    ins         t8, s0, 16, 16  /* t8 = tmp12|tmp13 */
+    add         t2, t0, t6      /* t2 = tmp11 = tmp1 + tmp2 */
+    mult        $0, $0          /* ac0  = 0 */
+    dpa.w.ph    $ac0, t8, s1    /* ac0 += t12*181 + t13*181 */
+    add         s0, t4, t2      /* t8 = tmp10+tmp11 */
+    sub         t4, t4, t2      /* t4 = tmp10-tmp11 */
+    sh          s0, 0(v0)
+    sh          t4, 64(v0)
+    extr.w      t2, $ac0, 8     /* z1 = MULTIPLY(tmp12+tmp13,
+                                                 FIX_0_707106781) */
+    addq.ph     t4, t8, t2      /* t9 = tmp13 + z1 */
+    subq.ph     t8, t8, t2      /* t2 = tmp13 - z1 */
+    sh          t4, 32(v0)
+    sh          t8, 96(v0)
+    add         t3, t3, t5      /* t3 = tmp10 = tmp4 + tmp5 */
+    add         t0, t5, t1      /* t0 = tmp11 = tmp5 + tmp6 */
+    add         t1, t1, t7      /* t1 = tmp12 = tmp6 + tmp7 */
+    andi        t4, a1, 0xffff
+    mul         s0, t1, t4
+    sra         s0, s0, 8       /* s0 = z4 =
+                                     MULTIPLY(tmp12, FIX_1_306562965) */
+    ins         t1, t3, 16, 16  /* t1 = tmp10|tmp12 */
+    mult        $0, $0          /* ac0  = 0 */
+    mulsa.w.ph  $ac0, t1, a3    /* ac0 += t10*98 - t12*98 */
+    extr.w      t8, $ac0, 8     /* z5 = MULTIPLY(tmp10-tmp12,
+                                                 FIX_0_382683433) */
+    add         t2, t7, t8      /* t2 = tmp7 + z5 */
+    sub         t7, t7, t8      /* t7 = tmp7 - z5 */
+    andi        t4, a2, 0xffff
+    mul         t8, t3, t4
+    sra         t8, t8, 8       /* t8 = z2 =
+                                     MULTIPLY(tmp10, FIX_0_541196100) */
+    andi        t4, s1, 0xffff
+    mul         t6, t0, t4
+    sra         t6, t6, 8       /* t6 = z3 =
+                                     MULTIPLY(tmp11, FIX_0_707106781) */
+    add         t0, t6, t8      /* t0 = z3 + z2 */
+    sub         t1, t6, t8      /* t1 = z3 - z2 */
+    add         t3, t6, s0      /* t3 = z3 + z4 */
+    sub         t4, t6, s0      /* t4 = z3 - z4 */
+    sub         t5, t2, t1      /* t5 = dataptr[5] */
+    sub         t6, t7, t0      /* t6 = dataptr[3] */
+    add         t3, t2, t3      /* t3 = dataptr[1] */
+    add         t4, t7, t4      /* t4 = dataptr[7] */
+    sh          t5, 80(v0)
+    sh          t6, 48(v0)
+    sh          t3, 16(v0)
+    sh          t4, 112(v0)
+    addiu       v0, 2
+    bne         v0, v1, 1b
+     nop
+
+    RESTORE_REGS_FROM_STACK 8, s0, s1
+
+    j           ra
+     nop
+END(jsimd_fdct_ifast_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 16, s0, s1, s2
+
+    addiu       v0, a2, 124     /* v0 = workspace_end */
+    lh          t0, 0(a2)
+    lh          t1, 0(a1)
+    lh          t2, 128(a1)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    lh          t4, 384(a1)
+    lh          t5, 130(a1)
+    lh          t6, 2(a2)
+    lh          t7, 2(a1)
+    lh          t8, 386(a1)
+
+1:
+    andi        t1, 0xffff
+    add         t9, t0, t2
+    andi        t9, 0xffff
+    mul         v1, t9, t1
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1
+    addiu       t9, t4, 16
+    srav        v1, v1, t9
+    mul         v1, v1, t3
+    mul         t6, t6, s0
+    andi        t7, 0xffff
+    addiu       a2, a2, 4
+    addiu       a1, a1, 4
+    add         s1, t6, t5
+    andi        s1, 0xffff
+    sh          v1, 0(a0)
+
+    mul         s2, s1, t7
+    addiu       s1, t8, 16
+    srav        s2, s2, s1
+    mul         s2, s2, s0
+    lh          t0, 0(a2)
+    lh          t1, 0(a1)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    lh          t2, 128(a1)
+    lh          t4, 384(a1)
+    lh          t5, 130(a1)
+    lh          t8, 386(a1)
+    lh          t6, 2(a2)
+    lh          t7, 2(a1)
+    sh          s2, 2(a0)
+    lh          t0, 0(a2)
+    sra         t3, t0, 15
+    sll         t3, t3, 1
+    addiu       t3, t3, 1
+    mul         t0, t0, t3
+    bne         a2, v0, 1b
+     addiu      a0, a0, 4
+
+    andi        t1, 0xffff
+    add         t9, t0, t2
+    andi        t9, 0xffff
+    mul         v1, t9, t1
+    sra         s0, t6, 15
+    sll         s0, s0, 1
+    addiu       s0, s0, 1
+    addiu       t9, t4, 16
+    srav        v1, v1, t9
+    mul         v1, v1, t3
+    mul         t6, t6, s0
+    andi        t7, 0xffff
+    sh          v1, 0(a0)
+    add         s1, t6, t5
+    andi        s1, 0xffff
+    mul         s2, s1, t7
+    addiu       s1, t8, 16
+    addiu       a2, a2, 4
+    addiu       a1, a1, 4
+    srav        s2, s2, s1
+    mul         s2, s2, s0
+    sh          s2, 2(a0)
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
+
+    j           ra
+     nop
+
+END(jsimd_quantize_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_quantize_float_dspr2)
+/*
+ * a0 = coef_block
+ * a1 = divisors
+ * a2 = workspace
+ */
+    .set at
+
+    li          t1, 0x46800100  /* integer representation 16384.5 */
+    mtc1        t1, f0
+    li          t0, 63
+0:
+    lwc1        f2, 0(a2)
+    lwc1        f10, 0(a1)
+    lwc1        f4, 4(a2)
+    lwc1        f12, 4(a1)
+    lwc1        f6, 8(a2)
+    lwc1        f14, 8(a1)
+    lwc1        f8, 12(a2)
+    lwc1        f16, 12(a1)
+    madd.s      f2, f0, f2, f10
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    lwc1        f10, 16(a1)
+    lwc1        f12, 20(a1)
+    trunc.w.s   f2, f2
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    lwc1        f14, 24(a1)
+    lwc1        f16, 28(a1)
+    mfc1        t1, f2
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    lwc1        f2, 16(a2)
+    lwc1        f4, 20(a2)
+    lwc1        f6, 24(a2)
+    lwc1        f8, 28(a2)
+    madd.s      f2, f0, f2, f10
+    madd.s      f4, f0, f4, f12
+    madd.s      f6, f0, f6, f14
+    madd.s      f8, f0, f8, f16
+    addiu       t1, t1, -16384
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    trunc.w.s   f2, f2
+    trunc.w.s   f4, f4
+    trunc.w.s   f6, f6
+    trunc.w.s   f8, f8
+    sh          t1, 0(a0)
+    sh          t2, 2(a0)
+    sh          t3, 4(a0)
+    sh          t4, 6(a0)
+    mfc1        t1, f2
+    mfc1        t2, f4
+    mfc1        t3, f6
+    mfc1        t4, f8
+    addiu       t0, t0, -8
+    addiu       a2, a2, 32
+    addiu       a1, a1, 32
+    addiu       t1, t1, -16384
+    addiu       t2, t2, -16384
+    addiu       t3, t3, -16384
+    addiu       t4, t4, -16384
+    sh          t1, 8(a0)
+    sh          t2, 10(a0)
+    sh          t3, 12(a0)
+    sh          t4, 14(a0)
+    bgez        t0, 0b
+     addiu      a0, a0, 16
+
+    j           ra
+     nop
+
+END(jsimd_quantize_float_dspr2)
+
+#endif
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_2x2_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
+
+    addiu       sp, sp, -40
+    move        v0, sp
+    addiu       s2, zero, 29692
+    addiu       s3, zero, -10426
+    addiu       s4, zero, 6967
+    addiu       s5, zero, -5906
+    lh          t0, 0(a1)       /* t0 = inptr[DCTSIZE*0] */
+    lh          t5, 0(a0)       /* t5 = quantptr[DCTSIZE*0] */
+    lh          t1, 48(a1)      /* t1 = inptr[DCTSIZE*3] */
+    lh          t6, 48(a0)      /* t6 = quantptr[DCTSIZE*3] */
+    mul         t4, t5, t0
+    lh          t0, 16(a1)      /* t0 = inptr[DCTSIZE*1] */
+    lh          t5, 16(a0)      /* t5 = quantptr[DCTSIZE*1] */
+    mul         t6, t6, t1
+    mul         t5, t5, t0
+    lh          t2, 80(a1)      /* t2 = inptr[DCTSIZE*5] */
+    lh          t7, 80(a0)      /* t7 = quantptr[DCTSIZE*5] */
+    lh          t3, 112(a1)     /* t3 = inptr[DCTSIZE*7] */
+    lh          t8, 112(a0)     /* t8 = quantptr[DCTSIZE*7] */
+    mul         t7, t7, t2
+    mult        zero, zero
+    mul         t8, t8, t3
+    li          s0, 0x73FCD746  /* s0 = (29692 << 16) | (-10426 & 0xffff) */
+    li          s1, 0x1B37E8EE  /* s1 = (6967 << 16) | (-5906 & 0xffff) */
+    ins         t6, t5, 16, 16  /* t6 = t5|t6 */
+    sll         t4, t4, 15
+    dpa.w.ph    $ac0, t6, s0
+    lh          t1, 2(a1)
+    lh          t6, 2(a0)
+    ins         t8, t7, 16, 16  /* t8 = t7|t8 */
+    dpa.w.ph    $ac0, t8, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 18(a1)
+    lh          t6, 18(a0)
+    lh          t2, 50(a1)
+    lh          t7, 50(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 82(a1)
+    lh          t2, 82(a0)
+    lh          t3, 114(a1)
+    lh          t4, 114(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 0(v0)
+    sw          t8, 20(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 6(a1)
+    lh          t6, 6(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 22(a1)
+    lh          t6, 22(a0)
+    lh          t2, 54(a1)
+    lh          t7, 54(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 86(a1)
+    lh          t2, 86(a0)
+    lh          t3, 118(a1)
+    lh          t4, 118(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 4(v0)
+    sw          t8, 24(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 10(a1)
+    lh          t6, 10(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 26(a1)
+    lh          t6, 26(a0)
+    lh          t2, 58(a1)
+    lh          t7, 58(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 90(a1)
+    lh          t2, 90(a0)
+    lh          t3, 122(a1)
+    lh          t4, 122(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 8(v0)
+    sw          t8, 28(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    lh          t1, 14(a1)
+    lh          t6, 14(a0)
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    mul         t5, t6, t1
+    lh          t1, 30(a1)
+    lh          t6, 30(a0)
+    lh          t2, 62(a1)
+    lh          t7, 62(a0)
+    mul         t6, t6, t1
+    subu        t8, t4, t0
+    mul         t7, t7, t2
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    lh          t1, 94(a1)
+    lh          t2, 94(a0)
+    lh          t3, 126(a1)
+    lh          t4, 126(a0)
+    shra_r.w    t8, t8, 13
+    mul         t1, t1, t2
+    mul         t3, t3, t4
+    sw          t0, 12(v0)
+    sw          t8, 32(v0)
+    sll         t4, t5, 15
+    ins         t7, t6, 16, 16
+    mult        zero, zero
+    dpa.w.ph    $ac0, t7, s0
+    ins         t3, t1, 16, 16
+    dpa.w.ph    $ac0, t3, s1
+    mflo        t0, $ac0
+    lw          t9, 0(a2)
+    lw          t3, 0(v0)
+    lw          t7, 4(v0)
+    lw          t1, 8(v0)
+    addu        t9, t9, a3
+    sll         t3, t3, 15
+    subu        t8, t4, t0
+    addu        t0, t4, t0
+    shra_r.w    t0, t0, 13
+    shra_r.w    t8, t8, 13
+    sw          t0, 16(v0)
+    sw          t8, 36(v0)
+    lw          t5, 12(v0)
+    lw          t6, 16(v0)
+    mult        t7, s2
+    madd        t1, s3
+    madd        t5, s4
+    madd        t6, s5
+    lw          t5, 24(v0)
+    lw          t7, 28(v0)
+    mflo        t0, $ac0
+    lw          t8, 32(v0)
+    lw          t2, 36(v0)
+    mult        $ac1, t5, s2
+    madd        $ac1, t7, s3
+    madd        $ac1, t8, s4
+    madd        $ac1, t2, s5
+    addu        t1, t3, t0
+    subu        t6, t3, t0
+    shra_r.w    t1, t1, 20
+    shra_r.w    t6, t6, 20
+    mflo        t4, $ac1
+    shll_s.w    t1, t1, 24
+    shll_s.w    t6, t6, 24
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128
+    addiu       t6, t6, 128
+    lw          t0, 20(v0)
+    sb          t1, 0(t9)
+    sb          t6, 1(t9)
+    sll         t0, t0, 15
+    lw          t9, 4(a2)
+    addu        t1, t0, t4
+    subu        t6, t0, t4
+    addu        t9, t9, a3
+    shra_r.w    t1, t1, 20
+    shra_r.w    t6, t6, 20
+    shll_s.w    t1, t1, 24
+    shll_s.w    t6, t6, 24
+    sra         t1, t1, 24
+    sra         t6, t6, 24
+    addiu       t1, t1, 128
+    addiu       t6, t6, 128
+    sb          t1, 0(t9)
+    sb          t6, 1(t9)
+    addiu       sp, sp, 40
+
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
+
+    j           ra
+     nop
+
+END(jsimd_idct_2x2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_4x4_dspr2)
+/*
+ * a0     = compptr->dct_table
+ * a1     = coef_block
+ * a2     = output_buf
+ * a3     = output_col
+ * 16(sp) = workspace[DCTSIZE*4]  (buffers data between passes)
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw          v1, 48(sp)
+    move        t0, a1
+    move        t1, v1
+    li          t9, 4
+    li          s0, 0x2e75f93e
+    li          s1, 0x21f9ba79
+    li          s2, 0xecc2efb0
+    li          s3, 0x52031ccd
+
+0:
+    lh          s6, 32(t0)      /* inptr[DCTSIZE*2] */
+    lh          t6, 32(a0)      /* quantptr[DCTSIZE*2] */
+    lh          s7, 96(t0)      /* inptr[DCTSIZE*6] */
+    lh          t7, 96(a0)      /* quantptr[DCTSIZE*6] */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          s4, 0(t0)       /* inptr[DCTSIZE*0] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s5, 0(a0)       /* quantptr[0] */
+    li          s6, 15137
+    li          s7, 6270
+    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          t5, 112(t0)     /* inptr[DCTSIZE*7] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s4, 112(a0)     /* quantptr[DCTSIZE*7] */
+    lh          v0, 80(t0)      /* inptr[DCTSIZE*5] */
+    lh          s5, 80(a0)      /* quantptr[DCTSIZE*5] */
+    lh          s6, 48(a0)      /* quantptr[DCTSIZE*3] */
+    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
+    lh          s7, 16(a0)      /* quantptr[DCTSIZE*1] */
+    lh          t8, 16(t0)      /* inptr[DCTSIZE*1] */
+    subu        t6, t6, t7      /* tmp2 =
+                                     MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
+    lh          t7, 48(t0)      /* inptr[DCTSIZE*3] */
+    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
+                                         quantptr[DCTSIZE*7]) */
+    mul         v0, s5, v0      /* z2 = (inptr[DCTSIZE*5] *
+                                         quantptr[DCTSIZE*5]) */
+    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
+                                         quantptr[DCTSIZE*3]) */
+    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
+                                         quantptr[DCTSIZE*1]) */
+    addu        t3, t2, t6      /* tmp10 = tmp0 + z2 */
+    subu        t4, t2, t6      /* tmp10 = tmp0 - z2 */
+    mult        $ac0, zero, zero
+    mult        $ac1, zero, zero
+    ins         t5, v0, 16, 16
+    ins         t7, t8, 16, 16
+    addiu       t9, t9, -1
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    mflo        s4, $ac0
+    mflo        s5, $ac1
+    addiu       a0, a0, 2
+    addiu       t1, t1, 4
+    addiu       t0, t0, 2
+    addu        t6, t4, s4
+    subu        t5, t4, s4
+    addu        s6, t3, s5
+    subu        s7, t3, s5
+    shra_r.w    t6, t6, 12      /* DESCALE(tmp12 + temp1, 12) */
+    shra_r.w    t5, t5, 12      /* DESCALE(tmp12 - temp1, 12) */
+    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
+    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
+    sw          t6, 28(t1)
+    sw          t5, 60(t1)
+    sw          s6, -4(t1)
+    bgtz        t9, 0b
+     sw         s7, 92(t1)
+    /* second loop three pass */
+    li          t9, 3
+1:
+    lh          s6, 34(t0)      /* inptr[DCTSIZE*2] */
+    lh          t6, 34(a0)      /* quantptr[DCTSIZE*2] */
+    lh          s7, 98(t0)      /* inptr[DCTSIZE*6] */
+    lh          t7, 98(a0)      /* quantptr[DCTSIZE*6] */
+    mul         t6, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          s4, 2(t0)       /* inptr[DCTSIZE*0] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s5, 2(a0)       /* quantptr[DCTSIZE*0] */
+    li          s6, 15137
+    li          s7, 6270
+    mul         t2, s4, s5      /* tmp0 = (inptr[0] * quantptr[0]) */
+    mul         v0, s6, t6      /* z2 = (inptr[DCTSIZE*2] *
+                                         quantptr[DCTSIZE*2]) */
+    lh          t5, 114(t0)     /* inptr[DCTSIZE*7] */
+    mul         t7, s7, t7      /* z3 = (inptr[DCTSIZE*6] *
+                                         quantptr[DCTSIZE*6]) */
+    lh          s4, 114(a0)     /* quantptr[DCTSIZE*7] */
+    lh          s5, 82(a0)      /* quantptr[DCTSIZE*5] */
+    lh          t6, 82(t0)      /* inptr[DCTSIZE*5] */
+    sll         t2, t2, 14      /* tmp0 <<= (CONST_BITS+1) */
+    lh          s6, 50(a0)      /* quantptr[DCTSIZE*3] */
+    lh          t8, 18(t0)      /* inptr[DCTSIZE*1] */
+    subu        v0, v0, t7      /* tmp2 =
+                                     MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
+    lh          t7, 50(t0)      /* inptr[DCTSIZE*3] */
+    lh          s7, 18(a0)      /* quantptr[DCTSIZE*1] */
+    mul         t5, s4, t5      /* z1 = (inptr[DCTSIZE*7] *
+                                         quantptr[DCTSIZE*7]) */
+    mul         t6, s5, t6      /* z2 = (inptr[DCTSIZE*5] *
+                                         quantptr[DCTSIZE*5]) */
+    mul         t7, s6, t7      /* z3 = (inptr[DCTSIZE*3] *
+                                         quantptr[DCTSIZE*3]) */
+    mul         t8, s7, t8      /* z4 = (inptr[DCTSIZE*1] *
+                                         quantptr[DCTSIZE*1]) */
+    addu        t3, t2, v0      /* tmp10 = tmp0 + z2 */
+    subu        t4, t2, v0      /* tmp10 = tmp0 - z2 */
+    mult        $ac0, zero, zero
+    mult        $ac1, zero, zero
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    mflo        t5, $ac0
+    mflo        t6, $ac1
+    addiu       t9, t9, -1
+    addiu       t0, t0, 2
+    addiu       a0, a0, 2
+    addiu       t1, t1, 4
+    addu        s5, t4, t5
+    subu        s4, t4, t5
+    addu        s6, t3, t6
+    subu        s7, t3, t6
+    shra_r.w    s5, s5, 12      /* DESCALE(tmp12 + temp1, 12) */
+    shra_r.w    s4, s4, 12      /* DESCALE(tmp12 - temp1, 12) */
+    shra_r.w    s6, s6, 12      /* DESCALE(tmp10 + temp2, 12) */
+    shra_r.w    s7, s7, 12      /* DESCALE(tmp10 - temp2, 12) */
+    sw          s5, 32(t1)
+    sw          s4, 64(t1)
+    sw          s6, 0(t1)
+    bgtz        t9, 1b
+     sw         s7, 96(t1)
+    move        t1, v1
+    li          s4, 15137
+    lw          s6, 8(t1)       /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 24(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 0(t1)       /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 28(t1)      /* wsptr[7] */
+    lh          t6, 20(t1)      /* wsptr[5] */
+    lh          t7, 12(t1)      /* wsptr[3] */
+    lh          t8, 4(t1)       /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 0(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    /* 2 */
+    li          s4, 15137
+    lw          s6, 40(t1)      /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 56(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 32(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 60(t1)      /* wsptr[7] */
+    lh          t6, 52(t1)      /* wsptr[5] */
+    lh          t7, 44(t1)      /* wsptr[3] */
+    lh          t8, 36(t1)      /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1,
+                                           CONST_BITS-PASS1_BITS+1) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1,
+                                           CONST_BITS-PASS1_BITS+1) */
+    sll         s4, t9, 2
+    lw          v0, 4(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    /* 3 */
+    li          s4, 15137
+    lw          s6, 72(t1)      /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 88(t1)      /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 64(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 92(t1)      /* wsptr[7] */
+    lh          t6, 84(t1)      /* wsptr[5] */
+    lh          t7, 76(t1)      /* wsptr[3] */
+    lh          t8, 68(t1)      /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2 */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2 */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 8(a2)       /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+    li          s4, 15137
+    lw          s6, 104(t1)     /* wsptr[2] */
+    li          s5, 6270
+    lw          s7, 120(t1)     /* wsptr[6] */
+    mul         s4, s4, s6      /* MULTIPLY((JLONG)wsptr[2],
+                                            FIX_1_847759065) */
+    lw          t2, 96(t1)      /* wsptr[0] */
+    mul         s5, s5, s7      /* MULTIPLY((JLONG)wsptr[6],
+                                            -FIX_0_765366865) */
+    lh          t5, 124(t1)     /* wsptr[7] */
+    lh          t6, 116(t1)     /* wsptr[5] */
+    lh          t7, 108(t1)     /* wsptr[3] */
+    lh          t8, 100(t1)     /* wsptr[1] */
+    ins         t5, t6, 16, 16
+    ins         t7, t8, 16, 16
+    mult        $ac0, zero, zero
+    dpa.w.ph    $ac0, t5, s0
+    dpa.w.ph    $ac0, t7, s1
+    mult        $ac1, zero, zero
+    dpa.w.ph    $ac1, t5, s2
+    dpa.w.ph    $ac1, t7, s3
+    sll         t2, t2, 14      /* tmp0 =
+                                     ((JLONG)wsptr[0]) << (CONST_BITS+1) */
+    mflo        s6, $ac0
+    /* MULTIPLY(wsptr[2], FIX_1_847759065) +
+       MULTIPLY(wsptr[6], -FIX_0_765366865) */
+    subu        s4, s4, s5
+    addu        t3, t2, s4      /* tmp10 = tmp0 + z2; */
+    mflo        s7, $ac1
+    subu        t4, t2, s4      /* tmp10 = tmp0 - z2; */
+    addu        t7, t4, s6
+    subu        t8, t4, s6
+    addu        t5, t3, s7
+    subu        t6, t3, s7
+    shra_r.w    t5, t5, 19      /* DESCALE(tmp10 + temp2, 19) */
+    shra_r.w    t6, t6, 19      /* DESCALE(tmp10 - temp2, 19) */
+    shra_r.w    t7, t7, 19      /* DESCALE(tmp12 + temp1, 19) */
+    shra_r.w    t8, t8, 19      /* DESCALE(tmp12 - temp1, 19) */
+    sll         s4, t9, 2
+    lw          v0, 12(a2)      /* output_buf[ctr] */
+    shll_s.w    t5, t5, 24
+    shll_s.w    t6, t6, 24
+    shll_s.w    t7, t7, 24
+    shll_s.w    t8, t8, 24
+    sra         t5, t5, 24
+    sra         t6, t6, 24
+    sra         t7, t7, 24
+    sra         t8, t8, 24
+    addu        v0, v0, a3      /* outptr = output_buf[ctr] + output_col */
+    addiu       t5, t5, 128
+    addiu       t6, t6, 128
+    addiu       t7, t7, 128
+    addiu       t8, t8, 128
+    sb          t5, 0(v0)
+    sb          t7, 1(v0)
+    sb          t8, 2(v0)
+    sb          t6, 3(v0)
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+END(jsimd_idct_4x4_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_6x6_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = output_buf
+ * a3 = output_col
+ */
+    .set at
+
+    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    addiu       sp, sp, -144
+    move        v0, sp
+    addiu       v1, v0, 24
+    addiu       t9, zero, 5793
+    addiu       s0, zero, 10033
+    addiu       s1, zero, 2998
+
+1:
+    lh          s2, 0(a0)       /* q0 = quantptr[ 0] */
+    lh          s3, 32(a0)      /* q1 = quantptr[16] */
+    lh          s4, 64(a0)      /* q2 = quantptr[32] */
+    lh          t2, 64(a1)      /* tmp2 = inptr[32] */
+    lh          t1, 32(a1)      /* tmp1 = inptr[16] */
+    lh          t0, 0(a1)       /* tmp0 = inptr[ 0] */
+    mul         t2, t2, s4      /* tmp2 = tmp2 * q2 */
+    mul         t1, t1, s3      /* tmp1 = tmp1 * q1 */
+    mul         t0, t0, s2      /* tmp0 = tmp0 * q0 */
+    lh          t6, 16(a1)      /* z1 = inptr[ 8] */
+    lh          t8, 80(a1)      /* z3 = inptr[40] */
+    lh          t7, 48(a1)      /* z2 = inptr[24] */
+    lh          s2, 16(a0)      /* q0 = quantptr[ 8] */
+    lh          s4, 80(a0)      /* q2 = quantptr[40] */
+    lh          s3, 48(a0)      /* q1 = quantptr[24] */
+    mul         t2, t2, t9      /* tmp2 = tmp2 * 5793 */
+    mul         t1, t1, s0      /* tmp1 = tmp1 * 10033 */
+    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
+    mul         t6, t6, s2      /* z1 = z1 * q0 */
+    mul         t8, t8, s4      /* z3 = z3 * q2 */
+    mul         t7, t7, s3      /* z2 = z2 * q1 */
+    addu        t3, t0, t2      /* tmp10 = tmp0 + tmp2 */
+    sll         t2, t2, 1       /* tmp2 = tmp2 << 2 */
+    subu        t4, t0, t2      /* tmp11 = tmp0 - tmp2; */
+    subu        t5, t3, t1      /* tmp12 = tmp10 - tmp1 */
+    addu        t3, t3, t1      /* tmp10 = tmp10 + tmp1 */
+    addu        t1, t6, t8      /* tmp1 = z1 + z3 */
+    mul         t1, t1, s1      /* tmp1 = tmp1 * 2998 */
+    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
+    subu        t2, t6, t8      /* tmp2 = z1 - z3 */
+    subu        t2, t2, t7      /* tmp2 = tmp2 - z2 */
+    sll         t2, t2, 2       /* tmp2 = tmp2 << 2 */
+    addu        t0, t6, t7      /* tmp0 = z1 + z2 */
+    sll         t0, t0, 13      /* tmp0 = tmp0 << 13 */
+    subu        s2, t8, t7      /* q0 = z3 - z2 */
+    sll         s2, s2, 13      /* q0 = q0 << 13 */
+    addu        t0, t0, t1      /* tmp0 = tmp0 + tmp1 */
+    addu        t1, s2, t1      /* tmp1 = q0 + tmp1 */
+    addu        s2, t4, t2      /* q0 = tmp11 + tmp2 */
+    subu        s3, t4, t2      /* q1 = tmp11 - tmp2 */
+    addu        t6, t3, t0      /* z1 = tmp10 + tmp0 */
+    subu        t7, t3, t0      /* z2 = tmp10 - tmp0 */
+    addu        t4, t5, t1      /* tmp11 = tmp12 + tmp1 */
+    subu        t5, t5, t1      /* tmp12 = tmp12 - tmp1 */
+    shra_r.w    t6, t6, 11      /* z1 = (z1 + 1024) >> 11 */
+    shra_r.w    t7, t7, 11      /* z2 = (z2 + 1024) >> 11 */
+    shra_r.w    t4, t4, 11      /* tmp11 = (tmp11 + 1024) >> 11 */
+    shra_r.w    t5, t5, 11      /* tmp12 = (tmp12 + 1024) >> 11 */
+    sw          s2, 24(v0)
+    sw          s3, 96(v0)
+    sw          t6, 0(v0)
+    sw          t7, 120(v0)
+    sw          t4, 48(v0)
+    sw          t5, 72(v0)
+    addiu       v0, v0, 4
+    addiu       a1, a1, 2
+    bne         v0, v1, 1b
+     addiu      a0, a0, 2
+
+    /* Pass 2: process 6 rows from work array, store into output array. */
+    move        v0, sp
+    addiu       v1, v0, 144
+
+2:
+    lw          t0, 0(v0)
+    lw          t2, 16(v0)
+    lw          s5, 0(a2)
+    addiu       t0, t0, 16
+    sll         t0, t0, 13
+    mul         t3, t2, t9
+    lw          t6, 4(v0)
+    lw          t8, 20(v0)
+    lw          t7, 12(v0)
+    addu        s5, s5, a3
+    addu        s6, t6, t8
+    mul         s6, s6, s1
+    addu        t1, t0, t3
+    subu        t4, t0, t3
+    subu        t4, t4, t3
+    lw          t3, 8(v0)
+    mul         t0, t3, s0
+    addu        s7, t6, t7
+    sll         s7, s7, 13
+    addu        s7, s6, s7
+    subu        t2, t8, t7
+    sll         t2, t2, 13
+    addu        t2, s6, t2
+    subu        s6, t6, t7
+    subu        s6, s6, t8
+    sll         s6, s6, 13
+    addu        t3, t1, t0
+    subu        t5, t1, t0
+    addu        t6, t3, s7
+    subu        t3, t3, s7
+    addu        t7, t4, s6
+    subu        t4, t4, s6
+    addu        t8, t5, t2
+    subu        t5, t5, t2
+    shll_s.w    t6, t6, 6
+    shll_s.w    t3, t3, 6
+    shll_s.w    t7, t7, 6
+    shll_s.w    t4, t4, 6
+    shll_s.w    t8, t8, 6
+    shll_s.w    t5, t5, 6
+    sra         t6, t6, 24
+    addiu       t6, t6, 128
+    sra         t3, t3, 24
+    addiu       t3, t3, 128
+    sb          t6, 0(s5)
+    sra         t7, t7, 24
+    addiu       t7, t7, 128
+    sb          t3, 5(s5)
+    sra         t4, t4, 24
+    addiu       t4, t4, 128
+    sb          t7, 1(s5)
+    sra         t8, t8, 24
+    addiu       t8, t8, 128
+    sb          t4, 4(s5)
+    addiu       v0, v0, 24
+    sra         t5, t5, 24
+    addiu       t5, t5, 128
+    sb          t8, 2(s5)
+    addiu       a2, a2,  4
+    bne         v0, v1, 2b
+     sb         t5, 3(s5)
+
+    addiu       sp, sp, 144
+
+    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
+
+    j           ra
+     nop
+
+END(jsimd_idct_6x6_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
+/*
+ * a0 = compptr->dct_table
+ * a1 = coef_block
+ * a2 = workspace
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 8
+
+1:
+    /* odd part */
+    lh          t0, 48(a1)
+    lh          t1, 48(a0)
+    lh          t2, 16(a1)
+    lh          t3, 16(a0)
+    lh          t4, 80(a1)
+    lh          t5, 80(a0)
+    lh          t6, 112(a1)
+    lh          t7, 112(a0)
+    mul         t0, t0, t1      /* z2 */
+    mul         t1, t2, t3      /* z1 */
+    mul         t2, t4, t5      /* z3 */
+    mul         t3, t6, t7      /* z4 */
+    li          t4, 10703       /* FIX(1.306562965) */
+    li          t5, 4433        /* FIX_0_541196100 */
+    li          t6, 7053        /* FIX(0.860918669) */
+    mul         t4, t0, t4      /* tmp11 */
+    mul         t5, t0, t5      /* -tmp14 */
+    addu        t7, t1, t2      /* tmp10 */
+    addu        t8, t7, t3      /* tmp10 + z4 */
+    mul         t6, t6, t8      /* tmp15 */
+    li          t8, 2139        /* FIX(0.261052384) */
+    mul         t8, t7, t8      /* MULTIPLY(tmp10, FIX(0.261052384)) */
+    li          t7, 2295        /* FIX(0.280143716) */
+    mul         t7, t1, t7      /* MULTIPLY(z1, FIX(0.280143716)) */
+    addu        t9, t2, t3      /* z3 + z4 */
+    li          s0, 8565        /* FIX(1.045510580) */
+    mul         t9, t9, s0      /* -tmp13 */
+    li          s0, 12112       /* FIX(1.478575242) */
+    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242) */
+    li          s1, 12998       /* FIX(1.586706681) */
+    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
+    li          s2, 5540        /* FIX(0.676326758) */
+    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
+    li          s3, 16244       /* FIX(1.982889723) */
+    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
+    subu        t1, t1, t3      /* z1-=z4 */
+    subu        t0, t0, t2      /* z2-=z3 */
+    addu        t2, t0, t1      /* z1+z2 */
+    li          t3, 4433        /* FIX_0_541196100 */
+    mul         t2, t2, t3      /* z3 */
+    li          t3, 6270        /* FIX_0_765366865 */
+    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
+    li          t3, 15137       /* FIX_0_765366865 */
+    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
+    addu        t8, t6, t8      /* tmp12 */
+    addu        t3, t8, t4      /* tmp12 + tmp11 */
+    addu        t3, t3, t7      /* tmp10 */
+    subu        t8, t8, t9      /* tmp12 + tmp13 */
+    addu        s0, t5, s0
+    subu        t8, t8, s0      /* tmp12 */
+    subu        t9, t6, t9
+    subu        s1, s1, t4
+    addu        t9, t9, s1      /* tmp13 */
+    subu        t6, t6, t5
+    subu        t6, t6, s2
+    subu        t6, t6, s3      /* tmp15 */
+    /* even part start */
+    lh          t4, 64(a1)
+    lh          t5, 64(a0)
+    lh          t7, 32(a1)
+    lh          s0, 32(a0)
+    lh          s1, 0(a1)
+    lh          s2, 0(a0)
+    lh          s3, 96(a1)
+    lh          v0, 96(a0)
+    mul         t4, t4, t5      /* DEQUANTIZE(inptr[DCTSIZE*4],
+                                              quantptr[DCTSIZE*4]) */
+    mul         t5, t7, s0      /* DEQUANTIZE(inptr[DCTSIZE*2],
+                                              quantptr[DCTSIZE*2]) */
+    mul         t7, s1, s2      /* DEQUANTIZE(inptr[DCTSIZE*0],
+                                              quantptr[DCTSIZE*0]) */
+    mul         s0, s3, v0      /* DEQUANTIZE(inptr[DCTSIZE*6],
+                                              quantptr[DCTSIZE*6]) */
+    /* odd part end */
+    addu        t1, t2, t1      /* tmp11 */
+    subu        t0, t2, t0      /* tmp14 */
+    /* update counter and pointers */
+    addiu       a3, a3, -1
+    addiu       a0, a0, 2
+    addiu       a1, a1, 2
+    /* even part rest */
+    li          s1, 10033
+    li          s2, 11190
+    mul         t4, t4, s1      /* z4 */
+    mul         s1, t5, s2      /* z4 */
+    sll         t5, t5, 13      /* z1 */
+    sll         t7, t7, 13
+    addiu       t7, t7, 1024    /* z3 */
+    sll         s0, s0, 13      /* z2 */
+    addu        s2, t7, t4      /* tmp10 */
+    subu        t4, t7, t4      /* tmp11 */
+    subu        s3, t5, s0      /* tmp12 */
+    addu        t2, t7, s3      /* tmp21 */
+    subu        s3, t7, s3      /* tmp24 */
+    addu        t7, s1, s0      /* tmp12 */
+    addu        v0, s2, t7      /* tmp20 */
+    subu        s2, s2, t7      /* tmp25 */
+    subu        s1, s1, t5      /* z4 - z1 */
+    subu        s1, s1, s0      /* tmp12 */
+    addu        s0, t4, s1      /* tmp22 */
+    subu        t4, t4, s1      /* tmp23 */
+    /* final output stage */
+    addu        t5, v0, t3
+    subu        v0, v0, t3
+    addu        t3, t2, t1
+    subu        t2, t2, t1
+    addu        t1, s0, t8
+    subu        s0, s0, t8
+    addu        t8, t4, t9
+    subu        t4, t4, t9
+    addu        t9, s3, t0
+    subu        s3, s3, t0
+    addu        t0, s2, t6
+    subu        s2, s2, t6
+    sra         t5, t5, 11
+    sra         t3, t3, 11
+    sra         t1, t1, 11
+    sra         t8, t8, 11
+    sra         t9, t9, 11
+    sra         t0, t0, 11
+    sra         s2, s2, 11
+    sra         s3, s3, 11
+    sra         t4, t4, 11
+    sra         s0, s0, 11
+    sra         t2, t2, 11
+    sra         v0, v0, 11
+    sw          t5, 0(a2)
+    sw          t3, 32(a2)
+    sw          t1, 64(a2)
+    sw          t8, 96(a2)
+    sw          t9, 128(a2)
+    sw          t0, 160(a2)
+    sw          s2, 192(a2)
+    sw          s3, 224(a2)
+    sw          t4, 256(a2)
+    sw          s0, 288(a2)
+    sw          t2, 320(a2)
+    sw          v0, 352(a2)
+    bgtz        a3, 1b
+     addiu      a2, a2, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    j           ra
+     nop
+
+END(jsimd_idct_12x12_pass1_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
+/*
+ * a0 = workspace
+ * a1 = output
+ */
+    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
+
+    li          a3, 12
+
+1:
+    /* Odd part */
+    lw          t0, 12(a0)
+    lw          t1, 4(a0)
+    lw          t2, 20(a0)
+    lw          t3, 28(a0)
+    li          t4, 10703       /* FIX(1.306562965) */
+    li          t5, 4433        /* FIX_0_541196100 */
+    mul         t4, t0, t4      /* tmp11 */
+    mul         t5, t0, t5      /* -tmp14 */
+    addu        t6, t1, t2      /* tmp10 */
+    li          t7, 2139        /* FIX(0.261052384) */
+    mul         t7, t6, t7      /* MULTIPLY(tmp10, FIX(0.261052384)) */
+    addu        t6, t6, t3      /* tmp10 + z4 */
+    li          t8, 7053        /* FIX(0.860918669) */
+    mul         t6, t6, t8      /* tmp15 */
+    li          t8, 2295        /* FIX(0.280143716) */
+    mul         t8, t1, t8      /* MULTIPLY(z1, FIX(0.280143716)) */
+    addu        t9, t2, t3      /* z3 + z4 */
+    li          s0, 8565        /* FIX(1.045510580) */
+    mul         t9, t9, s0      /* -tmp13 */
+    li          s0, 12112       /* FIX(1.478575242) */
+    mul         s0, t2, s0      /* MULTIPLY(z3, FIX(1.478575242)) */
+    li          s1, 12998       /* FIX(1.586706681) */
+    mul         s1, t3, s1      /* MULTIPLY(z4, FIX(1.586706681)) */
+    li          s2, 5540        /* FIX(0.676326758) */
+    mul         s2, t1, s2      /* MULTIPLY(z1, FIX(0.676326758)) */
+    li          s3, 16244       /* FIX(1.982889723) */
+    mul         s3, t3, s3      /* MULTIPLY(z4, FIX(1.982889723)) */
+    subu        t1, t1, t3      /* z1 -= z4 */
+    subu        t0, t0, t2      /* z2 -= z3 */
+    addu        t2, t1, t0      /* z1 + z2 */
+    li          t3, 4433        /* FIX_0_541196100 */
+    mul         t2, t2, t3      /* z3 */
+    li          t3, 6270        /* FIX_0_765366865 */
+    mul         t1, t1, t3      /* MULTIPLY(z1, FIX_0_765366865) */
+    li          t3, 15137       /* FIX_1_847759065 */
+    mul         t0, t0, t3      /* MULTIPLY(z2, FIX_1_847759065) */
+    addu        t3, t6, t7      /* tmp12 */
+    addu        t7, t3, t4
+    addu        t7, t7, t8      /* tmp10 */
+    subu        t3, t3, t9
+    subu        t3, t3, t5
+    subu        t3, t3, s0      /* tmp12 */
+    subu        t9, t6, t9
+    subu        t9, t9, t4
+    addu        t9, t9, s1      /* tmp13 */
+    subu        t6, t6, t5
+    subu        t6, t6, s2
+    subu        t6, t6, s3      /* tmp15 */
+    addu        t1, t2, t1      /* tmp11 */
+    subu        t0, t2, t0      /* tmp14 */
+    /* even part */
+    lw          t2, 16(a0)      /* z4 */
+    lw          t4, 8(a0)       /* z1 */
+    lw          t5, 0(a0)       /* z3 */
+    lw          t8, 24(a0)      /* z2 */
+    li          s0, 10033       /* FIX(1.224744871) */
+    li          s1, 11190       /* FIX(1.366025404) */
+    mul         t2, t2, s0      /* z4 */
+    mul         s0, t4, s1      /* z4 */
+    addiu       t5, t5, 0x10
+    sll         t5, t5, 13      /* z3 */
+    sll         t4, t4, 13      /* z1 */
+    sll         t8, t8, 13      /* z2 */
+    subu        s1, t4, t8      /* tmp12 */
+    addu        s2, t5, t2      /* tmp10 */
+    subu        t2, t5, t2      /* tmp11 */
+    addu        s3, t5, s1      /* tmp21 */
+    subu        s1, t5, s1      /* tmp24 */
+    addu        t5, s0, t8      /* tmp12 */
+    addu        v0, s2, t5      /* tmp20 */
+    subu        t5, s2, t5      /* tmp25 */
+    subu        t4, s0, t4
+    subu        t4, t4, t8      /* tmp12 */
+    addu        t8, t2, t4      /* tmp22 */
+    subu        t2, t2, t4      /* tmp23 */
+    /* increment counter and pointers */
+    addiu       a3, a3, -1
+    addiu       a0, a0, 32
+    /* Final stage */
+    addu        t4, v0, t7
+    subu        v0, v0, t7
+    addu        t7, s3, t1
+    subu        s3, s3, t1
+    addu        t1, t8, t3
+    subu        t8, t8, t3
+    addu        t3, t2, t9
+    subu        t2, t2, t9
+    addu        t9, s1, t0
+    subu        s1, s1, t0
+    addu        t0, t5, t6
+    subu        t5, t5, t6
+    sll         t4, t4, 4
+    sll         t7, t7, 4
+    sll         t1, t1, 4
+    sll         t3, t3, 4
+    sll         t9, t9, 4
+    sll         t0, t0, 4
+    sll         t5, t5, 4
+    sll         s1, s1, 4
+    sll         t2, t2, 4
+    sll         t8, t8, 4
+    sll         s3, s3, 4
+    sll         v0, v0, 4
+    shll_s.w    t4, t4, 2
+    shll_s.w    t7, t7, 2
+    shll_s.w    t1, t1, 2
+    shll_s.w    t3, t3, 2
+    shll_s.w    t9, t9, 2
+    shll_s.w    t0, t0, 2
+    shll_s.w    t5, t5, 2
+    shll_s.w    s1, s1, 2
+    shll_s.w    t2, t2, 2
+    shll_s.w    t8, t8, 2
+    shll_s.w    s3, s3, 2
+    shll_s.w    v0, v0, 2
+    srl         t4, t4, 24
+    srl         t7, t7, 24
+    srl         t1, t1, 24
+    srl         t3, t3, 24
+    srl         t9, t9, 24
+    srl         t0, t0, 24
+    srl         t5, t5, 24
+    srl         s1, s1, 24
+    srl         t2, t2, 24
+    srl         t8, t8, 24
+    srl         s3, s3, 24
+    srl         v0, v0, 24
+    lw          t6, 0(a1)
+    addiu       t4, t4, 0x80
+    addiu       t7, t7, 0x80
+    addiu       t1, t1, 0x80
+    addiu       t3, t3, 0x80
+    addiu       t9, t9, 0x80
+    addiu       t0, t0, 0x80
+    addiu       t5, t5, 0x80
+    addiu       s1, s1, 0x80
+    addiu       t2, t2, 0x80
+    addiu       t8, t8, 0x80
+    addiu       s3, s3, 0x80
+    addiu       v0, v0, 0x80
+    sb          t4, 0(t6)
+    sb          t7, 1(t6)
+    sb          t1, 2(t6)
+    sb          t3, 3(t6)
+    sb          t9, 4(t6)
+    sb          t0, 5(t6)
+    sb          t5, 6(t6)
+    sb          s1, 7(t6)
+    sb          t2, 8(t6)
+    sb          t8, 9(t6)
+    sb          s3, 10(t6)
+    sb          v0, 11(t6)
+    bgtz        a3, 1b
+     addiu      a1, a1, 4
+
+    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
+
+    jr          ra
+     nop
+
+END(jsimd_idct_12x12_pass2_dspr2)
+
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
+    lw            t0, 0(a0)
+    li            t7, 0xff80ff80
+    addu          t0, t0, a1
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    lw            t0, 4(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 0(a2)
+    usw           t4, 4(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 8(a2)
+    usw           t6, 12(a2)
+
+    lw            t0, 8(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 16(a2)
+    usw           t4, 20(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 24(a2)
+    usw           t6, 28(a2)
+
+    lw            t0, 12(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 32(a2)
+    usw           t4, 36(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 40(a2)
+    usw           t6, 44(a2)
+
+    lw            t0, 16(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 48(a2)
+    usw           t4, 52(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 56(a2)
+    usw           t6, 60(a2)
+
+    lw            t0, 20(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 64(a2)
+    usw           t4, 68(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 72(a2)
+    usw           t6, 76(a2)
+
+    lw            t0, 24(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 80(a2)
+    usw           t4, 84(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 88(a2)
+    usw           t6, 92(a2)
+
+    lw            t0, 28(a0)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu          t0, t0, a1
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    ulw           t1, 0(t0)
+    ulw           t2, 4(t0)
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 96(a2)
+    usw           t4, 100(a2)
+    preceu.ph.qbr t3, t1
+    preceu.ph.qbl t4, t1
+    usw           t5, 104(a2)
+    usw           t6, 108(a2)
+    preceu.ph.qbr t5, t2
+    preceu.ph.qbl t6, t2
+    addu.ph       t3, t3, t7
+    addu.ph       t4, t4, t7
+    addu.ph       t5, t5, t7
+    addu.ph       t6, t6, t7
+    usw           t3, 112(a2)
+    usw           t4, 116(a2)
+    usw           t5, 120(a2)
+    usw           t6, 124(a2)
+
+    j             ra
+     nop
+
+END(jsimd_convsamp_dspr2)
+
+
+#ifndef __mips_soft_float
+
+/*****************************************************************************/
+LEAF_DSPR2(jsimd_convsamp_float_dspr2)
+/*
+ * a0 = sample_data
+ * a1 = start_col
+ * a2 = workspace
+ */
+    .set at
+
+    lw          t0, 0(a0)
+    addu        t0, t0, a1
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 4(a0)
+    swc1        f2, 0(a2)
+    swc1        f4, 4(a2)
+    swc1        f6, 8(a2)
+    addu        t0, t0, a1
+    swc1        f8, 12(a2)
+    swc1        f10, 16(a2)
+    swc1        f12, 20(a2)
+    swc1        f14, 24(a2)
+    swc1        f16, 28(a2)
+    /* elemr 1 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 8(a0)
+    swc1        f2, 32(a2)
+    swc1        f4, 36(a2)
+    swc1        f6, 40(a2)
+    addu        t0, t0, a1
+    swc1        f8, 44(a2)
+    swc1        f10, 48(a2)
+    swc1        f12, 52(a2)
+    swc1        f14, 56(a2)
+    swc1        f16, 60(a2)
+    /* elemr 2 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 12(a0)
+    swc1        f2, 64(a2)
+    swc1        f4, 68(a2)
+    swc1        f6, 72(a2)
+    addu        t0, t0, a1
+    swc1        f8, 76(a2)
+    swc1        f10, 80(a2)
+    swc1        f12, 84(a2)
+    swc1        f14, 88(a2)
+    swc1        f16, 92(a2)
+    /*  elemr 3 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 16(a0)
+    swc1        f2, 96(a2)
+    swc1        f4, 100(a2)
+    swc1        f6, 104(a2)
+    addu        t0, t0, a1
+    swc1        f8, 108(a2)
+    swc1        f10, 112(a2)
+    swc1        f12, 116(a2)
+    swc1        f14, 120(a2)
+    swc1        f16, 124(a2)
+    /* elemr 4 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 20(a0)
+    swc1        f2, 128(a2)
+    swc1        f4, 132(a2)
+    swc1        f6, 136(a2)
+    addu        t0, t0, a1
+    swc1        f8, 140(a2)
+    swc1        f10, 144(a2)
+    swc1        f12, 148(a2)
+    swc1        f14, 152(a2)
+    swc1        f16, 156(a2)
+    /* elemr 5 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 24(a0)
+    swc1        f2, 160(a2)
+    swc1        f4, 164(a2)
+    swc1        f6, 168(a2)
+    addu        t0, t0, a1
+    swc1        f8, 172(a2)
+    swc1        f10, 176(a2)
+    swc1        f12, 180(a2)
+    swc1        f14, 184(a2)
+    swc1        f16, 188(a2)
+    /* elemr 6 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    lw          t0, 28(a0)
+    swc1        f2, 192(a2)
+    swc1        f4, 196(a2)
+    swc1        f6, 200(a2)
+    addu        t0, t0, a1
+    swc1        f8, 204(a2)
+    swc1        f10, 208(a2)
+    swc1        f12, 212(a2)
+    swc1        f14, 216(a2)
+    swc1        f16, 220(a2)
+    /* elemr 7 */
+    lbu         t1, 0(t0)
+    lbu         t2, 1(t0)
+    lbu         t3, 2(t0)
+    lbu         t4, 3(t0)
+    lbu         t5, 4(t0)
+    lbu         t6, 5(t0)
+    lbu         t7, 6(t0)
+    lbu         t8, 7(t0)
+    addiu       t1, t1, -128
+    addiu       t2, t2, -128
+    addiu       t3, t3, -128
+    addiu       t4, t4, -128
+    addiu       t5, t5, -128
+    addiu       t6, t6, -128
+    addiu       t7, t7, -128
+    addiu       t8, t8, -128
+    mtc1        t1, f2
+    mtc1        t2, f4
+    mtc1        t3, f6
+    mtc1        t4, f8
+    mtc1        t5, f10
+    mtc1        t6, f12
+    mtc1        t7, f14
+    mtc1        t8, f16
+    cvt.s.w     f2, f2
+    cvt.s.w     f4, f4
+    cvt.s.w     f6, f6
+    cvt.s.w     f8, f8
+    cvt.s.w     f10, f10
+    cvt.s.w     f12, f12
+    cvt.s.w     f14, f14
+    cvt.s.w     f16, f16
+    swc1        f2, 224(a2)
+    swc1        f4, 228(a2)
+    swc1        f6, 232(a2)
+    swc1        f8, 236(a2)
+    swc1        f10, 240(a2)
+    swc1        f12, 244(a2)
+    swc1        f14, 248(a2)
+    swc1        f16, 252(a2)
+
+    j           ra
+     nop
+
+END(jsimd_convsamp_float_dspr2)
+
+#endif
+
+/*****************************************************************************/
diff --git a/media/libjpeg/simd/mips/jsimd_dspr2_asm.h b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
new file mode 100644
index 0000000000..12cfda486c
--- /dev/null
+++ b/media/libjpeg/simd/mips/jsimd_dspr2_asm.h
@@ -0,0 +1,292 @@
+/*
+ * MIPS DSPr2 optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2013, MIPS Technologies, Inc., California.
+ * Copyright (C) 2018, Matthieu Darbois.
+ * All Rights Reserved.
+ * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
+ *           Darko Laus       (darko.laus@imgtec.com)
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define zero  $0
+#define AT    $1
+#define v0    $2
+#define v1    $3
+#define a0    $4
+#define a1    $5
+#define a2    $6
+#define a3    $7
+#define t0    $8
+#define t1    $9
+#define t2    $10
+#define t3    $11
+#define t4    $12
+#define t5    $13
+#define t6    $14
+#define t7    $15
+#define s0    $16
+#define s1    $17
+#define s2    $18
+#define s3    $19
+#define s4    $20
+#define s5    $21
+#define s6    $22
+#define s7    $23
+#define t8    $24
+#define t9    $25
+#define k0    $26
+#define k1    $27
+#define gp    $28
+#define sp    $29
+#define fp    $30
+#define s8    $30
+#define ra    $31
+
+#define f0    $f0
+#define f1    $f1
+#define f2    $f2
+#define f3    $f3
+#define f4    $f4
+#define f5    $f5
+#define f6    $f6
+#define f7    $f7
+#define f8    $f8
+#define f9    $f9
+#define f10   $f10
+#define f11   $f11
+#define f12   $f12
+#define f13   $f13
+#define f14   $f14
+#define f15   $f15
+#define f16   $f16
+#define f17   $f17
+#define f18   $f18
+#define f19   $f19
+#define f20   $f20
+#define f21   $f21
+#define f22   $f22
+#define f23   $f23
+#define f24   $f24
+#define f25   $f25
+#define f26   $f26
+#define f27   $f27
+#define f28   $f28
+#define f29   $f29
+#define f30   $f30
+#define f31   $f31
+
+#ifdef __ELF__
+#define HIDDEN_SYMBOL(symbol)  .hidden symbol;
+#else
+#define HIDDEN_SYMBOL(symbol)
+#endif
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol) \
+    .globl      symbol; \
+    HIDDEN_SYMBOL(symbol) \
+    .align      2; \
+    .type       symbol, @function; \
+    .ent        symbol, 0; \
+symbol: \
+    .frame      sp, 0, ra; \
+    .set        push; \
+    .set        arch = mips32r2; \
+    .set        noreorder; \
+    .set        noat;
+
+/*
+ * LEAF_DSPR2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_DSPR2(symbol) \
+LEAF_MIPS32R2(symbol) \
+    .set        dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function) \
+    .set        pop; \
+    .end        function; \
+    .size       function, .-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK  stack_offset = 0, r1, \
+                           r2  = 0, r3  = 0, r4  = 0, \
+                           r5  = 0, r6  = 0, r7  = 0, \
+                           r8  = 0, r9  = 0, r10 = 0, \
+                           r11 = 0, r12 = 0, r13 = 0, \
+                           r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, -\stack_offset
+.endif
+    sw          \r1, 0(sp)
+.if \r2 != 0
+    sw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    sw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    sw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw          \r14, 52(sp)
+.endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK  stack_offset = 0, r1, \
+                                r2  = 0, r3  = 0, r4  = 0, \
+                                r5  = 0, r6  = 0, r7  = 0, \
+                                r8  = 0, r9  = 0, r10 = 0, \
+                                r11 = 0, r12 = 0, r13 = 0, \
+                                r14 = 0
+.if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+.endif
+    lw          \r1, 0(sp)
+.if \r2 != 0
+    lw          \r2, 4(sp)
+.endif
+.if \r3 != 0
+    lw          \r3, 8(sp)
+.endif
+.if \r4 != 0
+    lw          \r4, 12(sp)
+.endif
+.if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw          \r5, 16(sp)
+.endif
+.if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw          \r6, 20(sp)
+.endif
+.if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw          \r7, 24(sp)
+.endif
+.if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw          \r8, 28(sp)
+.endif
+.if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw          \r9, 32(sp)
+.endif
+.if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw          \r10, 36(sp)
+.endif
+.if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw          \r11, 40(sp)
+.endif
+.if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw          \r12, 44(sp)
+.endif
+.if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw          \r13, 48(sp)
+.endif
+.if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw          \r14, 52(sp)
+.endif
+.if \stack_offset != 0
+    addiu       sp, sp, \stack_offset
+.endif
+.endm
diff --git a/media/libjpeg/simd/mips64/jccolext-mmi.c b/media/libjpeg/simd/mips64/jccolext-mmi.c
new file mode 100644
index 0000000000..558eb2ab10
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                               JSAMPIMAGE output_buf, JDIMENSION output_row,
+                               int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+  __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+      blo = _mm_loadlo_pi16_f(bo);
+      bho = _mm_loadhi_pi16_f(bo);
+      halfblo = _mm_srli_pi32(blo, 1);
+      halfbho = _mm_srli_pi32(bho, 1);
+
+      cblo = _mm_add_pi32(cblo, halfblo);
+      cbho = _mm_add_pi32(cbho, halfbho);
+      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+      cblo = _mm_srli_pi32(cblo, SCALEBITS);
+      cbho = _mm_srli_pi32(cbho, SCALEBITS);
+      cbo = _mm_packs_pi32(cblo, cbho);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+      ble = _mm_loadlo_pi16_f(be);
+      bhe = _mm_loadhi_pi16_f(be);
+      halfble = _mm_srli_pi32(ble, 1);
+      halfbhe = _mm_srli_pi32(bhe, 1);
+
+      cble = _mm_add_pi32(cble, halfble);
+      cbhe = _mm_add_pi32(cbhe, halfbhe);
+      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+      cble = _mm_srli_pi32(cble, SCALEBITS);
+      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+      cbe = _mm_packs_pi32(cble, cbhe);
+
+      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+      cb = _mm_or_si64(cbe, cbo);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      rlo = _mm_loadlo_pi16_f(ro);
+      rho = _mm_loadhi_pi16_f(ro);
+      halfrlo = _mm_srli_pi32(rlo, 1);
+      halfrho = _mm_srli_pi32(rho, 1);
+
+      crlo = _mm_add_pi32(crlo, halfrlo);
+      crho = _mm_add_pi32(crho, halfrho);
+      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+      crlo = _mm_srli_pi32(crlo, SCALEBITS);
+      crho = _mm_srli_pi32(crho, SCALEBITS);
+      cro = _mm_packs_pi32(crlo, crho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      rle = _mm_loadlo_pi16_f(re);
+      rhe = _mm_loadhi_pi16_f(re);
+      halfrle = _mm_srli_pi32(rle, 1);
+      halfrhe = _mm_srli_pi32(rhe, 1);
+
+      crle = _mm_add_pi32(crle, halfrle);
+      crhe = _mm_add_pi32(crhe, halfrhe);
+      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+      crle = _mm_srli_pi32(crle, SCALEBITS);
+      crhe = _mm_srli_pi32(crhe, SCALEBITS);
+      cre = _mm_packs_pi32(crle, crhe);
+
+      cro = _mm_slli_pi16(cro, BYTE_BIT);
+      cr = _mm_or_si64(cre, cro);
+
+      _mm_store_si64((__m64 *)&outptr0[0], y);
+      _mm_store_si64((__m64 *)&outptr1[0], cb);
+      _mm_store_si64((__m64 *)&outptr2[0], cr);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jccolor-mmi.c b/media/libjpeg/simd/mips64/jccolor-mmi.c
new file mode 100644
index 0000000000..93ef5c79f7
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jccolor-mmi.c
@@ -0,0 +1,148 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_081  ((short)5329)                /* FIX(0.08131) */
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_168  ((short)11059)               /* FIX(0.16874) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_331  ((short)21709)               /* FIX(0.33126) */
+#define F_0_418  ((short)27439)               /* FIX(0.41869) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250,
+  index_PW_MF016_MF033,
+  index_PW_MF008_MF041,
+  index_PD_ONEHALFM1_CJ
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114),
+  _uint64_set_pi16(-F_0_331, -F_0_168, -F_0_331, -F_0_168),
+  _uint64_set_pi16(-F_0_418, -F_0_081, -F_0_418, -F_0_081),
+  _uint64_set_pi32(((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)),
+                   ((1 << (SCALEBITS - 1)) - 1 + (CENTERJSAMPLE << SCALEBITS)))
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+#define PW_MF016_MF033   get_const_value(index_PW_MF016_MF033)
+#define PW_MF008_MF041   get_const_value(index_PW_MF008_MF041)
+#define PD_ONEHALFM1_CJ  get_const_value(index_PD_ONEHALFM1_CJ)
+
+
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extrgbx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extbgrx_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extxbgr_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_mmi  jsimd_extxrgb_ycc_convert_mmi
+#include "jccolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgray-mmi.c b/media/libjpeg/simd/mips64/jcgray-mmi.c
new file mode 100644
index 0000000000..9c7b833f2e
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700) */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jcgryext-mmi.c b/media/libjpeg/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 0000000000..08a83d6699
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      _mm_store_si64((__m64 *)&outptr[0], y);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jcsample-mmi.c b/media/libjpeg/simd/mips64/jcsample-mmi.c
new file mode 100644
index 0000000000..0354dac087
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jcsample-mmi.c
@@ -0,0 +1,98 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA DOWNSAMPLING */
+
+#include "jsimd_mmi.h"
+#include "jcsample.h"
+
+
+void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
+                               JDIMENSION v_samp_factor,
+                               JDIMENSION width_in_blocks,
+                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  int inrow, outrow, outcol;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
+  JSAMPROW inptr0, inptr1, outptr;
+  __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+  __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+  __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
+
+  expand_right_edge(input_data, max_v_samp_factor, image_width,
+                    output_cols * 2);
+
+  bias = _mm_set1_pi32((1 << 17) + 1);   /* 0x00020001 (32-bit bias pattern) */
+                                         /* bias={1, 2, 1, 2} (16-bit) */
+  mask = _mm_cmpeq_pi16(mask, mask);
+  mask = _mm_srli_pi16(mask, BYTE_BIT);  /* {0xFF 0x00 0xFF 0x00 ..} */
+
+  for (inrow = 0, outrow = 0; outrow < v_samp_factor;
+       inrow += 2, outrow++) {
+
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr = output_data[outrow];
+
+    for (outcol = output_cols; outcol > 0;
+         outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
+
+      this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+      this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+      next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+      next1 = _mm_load_si64((__m64 *)&inptr1[8]);
+
+      this0o = _mm_and_si64(this0, mask);
+      this0e = _mm_srli_pi16(this0, BYTE_BIT);
+      this1o = _mm_and_si64(this1, mask);
+      this1e = _mm_srli_pi16(this1, BYTE_BIT);
+      this0sum = _mm_add_pi16(this0o, this0e);
+      this1sum = _mm_add_pi16(this1o, this1e);
+
+      next0o = _mm_and_si64(next0, mask);
+      next0e = _mm_srli_pi16(next0, BYTE_BIT);
+      next1o = _mm_and_si64(next1, mask);
+      next1e = _mm_srli_pi16(next1, BYTE_BIT);
+      next0sum = _mm_add_pi16(next0o, next0e);
+      next1sum = _mm_add_pi16(next1o, next1e);
+
+      thisavg = _mm_add_pi16(this0sum, this1sum);
+      nextavg = _mm_add_pi16(next0sum, next1sum);
+      thisavg = _mm_add_pi16(thisavg, bias);
+      nextavg = _mm_add_pi16(nextavg, bias);
+      thisavg = _mm_srli_pi16(thisavg, 2);
+      nextavg = _mm_srli_pi16(nextavg, 2);
+
+      avg = _mm_packs_pu16(thisavg, nextavg);
+
+      _mm_store_si64((__m64 *)&outptr[0], avg);
+    }
+  }
+}
diff --git a/media/libjpeg/simd/jcsample.h b/media/libjpeg/simd/mips64/jcsample.h
similarity index 68%
rename from media/libjpeg/simd/jcsample.h
rename to media/libjpeg/simd/mips64/jcsample.h
index 2a50544e97..bd07fcc4ed 100644
--- a/media/libjpeg/simd/jcsample.h
+++ b/media/libjpeg/simd/mips64/jcsample.h
@@ -8,19 +8,19 @@
  */
 
 LOCAL(void)
-expand_right_edge (JSAMPARRAY image_data, int num_rows,
-                   JDIMENSION input_cols, JDIMENSION output_cols)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
 {
   register JSAMPROW ptr;
   register JSAMPLE pixval;
   register int count;
   int row;
-  int numcols = (int) (output_cols - input_cols);
+  int numcols = (int)(output_cols - input_cols);
 
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
diff --git a/media/libjpeg/simd/mips64/jdcolext-mmi.c b/media/libjpeg/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 0000000000..3b5b2f2030
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                               JDIMENSION input_row, JSAMPARRAY output_buf,
+                               int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+  __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      mask = decenter = 0.0;
+      mask = _mm_cmpeq_pi16(mask, mask);
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbe = _mm_and_si64(mask, cb);           /* Cb(0246) */
+      cbo = _mm_srli_pi16(cb, BYTE_BIT);      /* Cb(1357) */
+      cre = _mm_and_si64(mask, cr);           /* Cr(0246) */
+      cro = _mm_srli_pi16(cr, BYTE_BIT);      /* Cr(1357) */
+      cbe = _mm_add_pi16(cbe, decenter);
+      cbo = _mm_add_pi16(cbo, decenter);
+      cre = _mm_add_pi16(cre, decenter);
+      cro = _mm_add_pi16(cro, decenter);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+
+      cbe2 = _mm_add_pi16(cbe, cbe);          /* 2*CbE */
+      cbo2 = _mm_add_pi16(cbo, cbo);          /* 2*CbO */
+      cre2 = _mm_add_pi16(cre, cre);          /* 2*CrE */
+      cro2 = _mm_add_pi16(cro, cro);          /* 2*CrO */
+
+      be = _mm_mulhi_pi16(cbe2, PW_MF0228);   /* (2*CbE * -FIX(0.22800) */
+      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);   /* (2*CbO * -FIX(0.22800) */
+      re = _mm_mulhi_pi16(cre2, PW_F0402);    /* (2*CrE * FIX(0.40200)) */
+      ro = _mm_mulhi_pi16(cro2, PW_F0402);    /* (2*CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, PW_ONE);
+      bo = _mm_add_pi16(bo, PW_ONE);
+      be = _mm_srai_pi16(be, 1);              /* (CbE * -FIX(0.22800)) */
+      bo = _mm_srai_pi16(bo, 1);              /* (CbO * -FIX(0.22800)) */
+      re = _mm_add_pi16(re, PW_ONE);
+      ro = _mm_add_pi16(ro, PW_ONE);
+      re = _mm_srai_pi16(re, 1);              /* (CrE * FIX(0.40200)) */
+      ro = _mm_srai_pi16(ro, 1);              /* (CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, cbe);
+      bo = _mm_add_pi16(bo, cbo);
+      be = _mm_add_pi16(be, cbe);             /* (CbE * FIX(1.77200))=(B-Y)E */
+      bo = _mm_add_pi16(bo, cbo);             /* (CbO * FIX(1.77200))=(B-Y)O */
+      re = _mm_add_pi16(re, cre);             /* (CrE * FIX(1.40200))=(R-Y)E */
+      ro = _mm_add_pi16(ro, cro);             /* (CrO * FIX(1.40200))=(R-Y)O */
+
+      gle = _mm_unpacklo_pi16(cbe, cre);
+      ghe = _mm_unpackhi_pi16(cbe, cre);
+      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+      glo = _mm_unpacklo_pi16(cbo, cro);
+      gho = _mm_unpackhi_pi16(cbo, cro);
+      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+      gle = _mm_add_pi32(gle, PD_ONEHALF);
+      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+      gle = _mm_srai_pi32(gle, SCALEBITS);
+      ghe = _mm_srai_pi32(ghe, SCALEBITS);
+      glo = _mm_add_pi32(glo, PD_ONEHALF);
+      gho = _mm_add_pi32(gho, PD_ONEHALF);
+      glo = _mm_srai_pi32(glo, SCALEBITS);
+      gho = _mm_srai_pi32(gho, SCALEBITS);
+
+      ge = _mm_packs_pi32(gle, ghe);       /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      go = _mm_packs_pi32(glo, gho);       /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      ye = _mm_and_si64(mask, y);             /* Y(0246) */
+      yo = _mm_srli_pi16(y, BYTE_BIT);        /* Y(1357) */
+
+      re = _mm_add_pi16(re, ye);              /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+      ro = _mm_add_pi16(ro, yo);              /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+      re = _mm_packs_pu16(re, re);            /* (R0 R2 R4 R6 ** ** ** **) */
+      ro = _mm_packs_pu16(ro, ro);            /* (R1 R3 R5 R7 ** ** ** **) */
+
+      ge = _mm_add_pi16(ge, ye);              /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+      go = _mm_add_pi16(go, yo);              /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+      ge = _mm_packs_pu16(ge, ge);            /* (G0 G2 G4 G6 ** ** ** **) */
+      go = _mm_packs_pu16(go, go);            /* (G1 G3 G5 G7 ** ** ** **) */
+
+      be = _mm_add_pi16(be, ye);              /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+      bo = _mm_add_pi16(bo, yo);              /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+      be = _mm_packs_pu16(be, be);            /* (B0 B2 B4 B6 ** ** ** **) */
+      bo = _mm_packs_pu16(bo, bo);            /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
+      mmD = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
+
+      mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+      mmG = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 05 06 16 26 07) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 01 02 12 22 03) */
+
+      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+      mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT);  /* (13 23 15 25 17 27 -- --) */
+
+      mmC = _mm_unpackhi_pi16(mmD, mmH);      /* (15 25 06 16 17 27 -- --) */
+      mmD = _mm_unpacklo_pi16(mmD, mmH);      /* (11 21 02 12 13 23 04 14) */
+
+      mmF = _mm_unpackhi_pi16(mmE, mmB);      /* (26 07 17 27 -- -- -- --) */
+      mmE = _mm_unpacklo_pi16(mmE, mmB);      /* (22 03 13 23 24 05 15 25) */
+
+      mmA = _mm_unpacklo_pi32(mmA, mmD);      /* (00 10 20 01 11 21 02 12) */
+      mmE = _mm_unpacklo_pi32(mmE, mmG);      /* (22 03 13 23 04 14 24 05) */
+      mmC = _mm_unpacklo_pi32(mmC, mmF);      /* (15 25 06 16 26 07 17 27) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmE);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 16\r\n"
+            "move     $9, %4\r\n"
+            "mov.s    $f4, %1\r\n"
+            "mov.s    $f6, %3\r\n"
+            "move     $10, %5\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %2\r\n"
+            "subu     $9, $9, 16\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+            "b        2f\r\n"
+            "nop      \r\n"
+
+            "1:       \r\n"
+            "li       $8, 8\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "subu     $9, $9, 8\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"              /* st4 */
+            "mfc1     $11, $f4\r\n"
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "swl      $11, 3($10)\r\n"
+            "swr      $11, 0($10)\r\n"
+            "li       $8, 32\r\n"
+            "mtc1     $8, $f6\r\n"
+            "dsrl     $f4, $f4, $f6\r\n"
+            "mfc1     $11, $f4\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 4\r\n"
+
+            "3:       \r\n"
+            "li       $8, 2\r\n"              /* st2 */
+            "bltu     $9, $8, 4f\r\n"
+            "nop      \r\n"
+            "ush      $11, 0($10)\r\n"
+            "srl      $11, 16\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 2\r\n"
+
+            "4:       \r\n"
+            "li       $8, 1\r\n"              /* st1 */
+            "bltu     $9, $8, 5f\r\n"
+            "nop      \r\n"
+            "sb       $11, 0($10)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"                   /* end */
+            : "=m" (*outptr)
+            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+           );
+      }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+      xo = _mm_cmpeq_pi8(xo, xo);
+#else
+      xe = _mm_xor_si64(xe, xe);
+      xo = _mm_xor_si64(xo, xo);
+#endif
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
+      mmB = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
+      mmF = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
+
+      mmC = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 34 06 16 26 36) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 30 02 12 22 32) */
+      mmG = _mm_unpackhi_pi16(mmB, mmF);      /* (05 15 25 35 07 17 27 37) */
+      mmB = _mm_unpacklo_pi16(mmB, mmF);      /* (01 11 21 31 03 13 23 33) */
+
+      mmD = _mm_unpackhi_pi32(mmA, mmB);      /* (02 12 22 32 03 13 23 33) */
+      mmA = _mm_unpacklo_pi32(mmA, mmB);      /* (00 10 20 30 01 11 21 31) */
+      mmH = _mm_unpackhi_pi32(mmC, mmG);      /* (06 16 26 36 07 17 27 37) */
+      mmC = _mm_unpacklo_pi32(mmC, mmG);      /* (04 14 24 34 05 15 25 35) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmD);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+          _mm_store_si64((__m64 *)(outptr + 24), mmH);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+          _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols;
+        asm(".set noreorder\r\n"              /* st16 */
+
+            "li       $8, 4\r\n"
+            "move     $9, %6\r\n"
+            "move     $10, %7\r\n"
+            "mov.s    $f4, %2\r\n"
+            "mov.s    $f6, %4\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "mov.s    $f6, %5\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, $f6\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 1\r\n"              /* st4 */
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "gsswlc1  $f4, 3($10)\r\n"
+            "gsswrc1  $f4, 0($10)\r\n"
+
+            "3:       \r\n"
+            "li       %1, 0\r\n"              /* end */
+            : "=m" (*outptr), "=r" (col)
+            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+              "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "memory"
+           );
+      }
+
+#endif
+
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdcolor-mmi.c b/media/libjpeg/simd/mips64/jdcolor-mmi.c
new file mode 100644
index 0000000000..2c58263dbd
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdcolor-mmi.c
@@ -0,0 +1,139 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extrgbx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extbgrx_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extxbgr_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_mmi  jsimd_ycc_extxrgb_convert_mmi
+#include "jdcolext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_mmi
diff --git a/media/libjpeg/simd/mips64/jdmerge-mmi.c b/media/libjpeg/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 0000000000..0a39bd5680
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/media/libjpeg/simd/mips64/jdmrgext-mmi.c b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 0000000000..be09ff2a65
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+  __m64 mm8, mm9;
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+       inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+
+    cb = _mm_load_si64((__m64 *)inptr1);
+    cr = _mm_load_si64((__m64 *)inptr2);
+    ythis = _mm_load_si64((__m64 *)inptr0);
+    ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+
+    mask = decenter = 0.0;
+    mask = _mm_cmpeq_pi16(mask, mask);
+    decenter = _mm_cmpeq_pi16(decenter, decenter);
+    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+    cbl = _mm_unpacklo_pi8(cb, zero);         /* Cb(0123) */
+    cbh = _mm_unpackhi_pi8(cb, zero);         /* Cb(4567) */
+    crl = _mm_unpacklo_pi8(cr, zero);         /* Cr(0123) */
+    crh = _mm_unpackhi_pi8(cr, zero);         /* Cr(4567) */
+    cbl = _mm_add_pi16(cbl, decenter);
+    cbh = _mm_add_pi16(cbh, decenter);
+    crl = _mm_add_pi16(crl, decenter);
+    crh = _mm_add_pi16(crh, decenter);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+
+    cbl2 = _mm_add_pi16(cbl, cbl);            /* 2*CbL */
+    cbh2 = _mm_add_pi16(cbh, cbh);            /* 2*CbH */
+    crl2 = _mm_add_pi16(crl, crl);            /* 2*CrL */
+    crh2 = _mm_add_pi16(crh, crh);            /* 2*CrH */
+
+    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);     /* (2*CbL * -FIX(0.22800) */
+    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);     /* (2*CbH * -FIX(0.22800) */
+    rl = _mm_mulhi_pi16(crl2, PW_F0402);      /* (2*CrL * FIX(0.40200)) */
+    rh = _mm_mulhi_pi16(crh2, PW_F0402);      /* (2*CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, PW_ONE);
+    bh = _mm_add_pi16(bh, PW_ONE);
+    bl = _mm_srai_pi16(bl, 1);                /* (CbL * -FIX(0.22800)) */
+    bh = _mm_srai_pi16(bh, 1);                /* (CbH * -FIX(0.22800)) */
+    rl = _mm_add_pi16(rl, PW_ONE);
+    rh = _mm_add_pi16(rh, PW_ONE);
+    rl = _mm_srai_pi16(rl, 1);                /* (CrL * FIX(0.40200)) */
+    rh = _mm_srai_pi16(rh, 1);                /* (CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, cbl);
+    bh = _mm_add_pi16(bh, cbh);
+    bl = _mm_add_pi16(bl, cbl);               /* (CbL * FIX(1.77200))=(B-Y)L */
+    bh = _mm_add_pi16(bh, cbh);               /* (CbH * FIX(1.77200))=(B-Y)H */
+    rl = _mm_add_pi16(rl, crl);               /* (CrL * FIX(1.40200))=(R-Y)L */
+    rh = _mm_add_pi16(rh, crh);               /* (CrH * FIX(1.40200))=(R-Y)H */
+
+    ga = _mm_unpacklo_pi16(cbl, crl);
+    gb = _mm_unpackhi_pi16(cbl, crl);
+    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+    gc = _mm_unpacklo_pi16(cbh, crh);
+    gd = _mm_unpackhi_pi16(cbh, crh);
+    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+    ga = _mm_add_pi32(ga, PD_ONEHALF);
+    gb = _mm_add_pi32(gb, PD_ONEHALF);
+    ga = _mm_srai_pi32(ga, SCALEBITS);
+    gb = _mm_srai_pi32(gb, SCALEBITS);
+    gc = _mm_add_pi32(gc, PD_ONEHALF);
+    gd = _mm_add_pi32(gd, PD_ONEHALF);
+    gc = _mm_srai_pi32(gc, SCALEBITS);
+    gd = _mm_srai_pi32(gd, SCALEBITS);
+
+    gl = _mm_packs_pi32(ga, gb);           /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+    gh = _mm_packs_pi32(gc, gd);           /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+    gl = _mm_sub_pi16(gl, crl);    /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+    gh = _mm_sub_pi16(gh, crh);    /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+
+    ythise = _mm_and_si64(mask, ythis);       /* Y(0246) */
+    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
+    ynexte = _mm_and_si64(mask, ynext);       /* Y(8ACE) */
+    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */
+
+    rle = _mm_add_pi16(rl, ythise);           /* (R0 R2 R4 R6) */
+    rlo = _mm_add_pi16(rl, ythiso);           /* (R1 R3 R5 R7) */
+    rhe = _mm_add_pi16(rh, ynexte);           /* (R8 RA RC RE) */
+    rho = _mm_add_pi16(rh, ynexto);           /* (R9 RB RD RF) */
+    re = _mm_packs_pu16(rle, rhe);            /* (R0 R2 R4 R6 R8 RA RC RE) */
+    ro = _mm_packs_pu16(rlo, rho);            /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+    gle = _mm_add_pi16(gl, ythise);           /* (G0 G2 G4 G6) */
+    glo = _mm_add_pi16(gl, ythiso);           /* (G1 G3 G5 G7) */
+    ghe = _mm_add_pi16(gh, ynexte);           /* (G8 GA GC GE) */
+    gho = _mm_add_pi16(gh, ynexto);           /* (G9 GB GD GF) */
+    ge = _mm_packs_pu16(gle, ghe);            /* (G0 G2 G4 G6 G8 GA GC GE) */
+    go = _mm_packs_pu16(glo, gho);            /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+    ble = _mm_add_pi16(bl, ythise);           /* (B0 B2 B4 B6) */
+    blo = _mm_add_pi16(bl, ythiso);           /* (B1 B3 B5 B7) */
+    bhe = _mm_add_pi16(bh, ynexte);           /* (B8 BA BC BE) */
+    bho = _mm_add_pi16(bh, ynexto);           /* (B9 BB BD BF) */
+    be = _mm_packs_pu16(ble, bhe);            /* (B0 B2 B4 B6 B8 BA BC BE) */
+    bo = _mm_packs_pu16(blo, bho);            /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    mmG = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mmA = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmH = _mm_unpacklo_pi8(mmE, mmB);         /* (20 01 22 03 24 05 26 07) */
+    mmE = _mm_unpackhi_pi8(mmE, mmB);         /* (28 09 2A 0B 2C 0D 2E 0F) */
+    mmC = _mm_unpacklo_pi8(mmD, mmF);         /* (11 21 13 23 15 25 17 27) */
+    mmD = _mm_unpackhi_pi8(mmD, mmF);         /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+    mmB = _mm_unpacklo_pi16(mmG, mmA);        /* (00 10 08 18 02 12 0A 1A) */
+    mmA = _mm_unpackhi_pi16(mmG, mmA);        /* (04 14 0C 1C 06 16 0E 1E) */
+    mmF = _mm_unpacklo_pi16(mmH, mmE);        /* (20 01 28 09 22 03 2A 0B) */
+    mmE = _mm_unpackhi_pi16(mmH, mmE);        /* (24 05 2C 0D 26 07 2E 0F) */
+    mmH = _mm_unpacklo_pi16(mmC, mmD);        /* (11 21 19 29 13 23 1B 2B) */
+    mmG = _mm_unpackhi_pi16(mmC, mmD);        /* (15 25 1D 2D 17 27 1F 2F) */
+
+    mmC = _mm_unpacklo_pi16(mmB, mmF);        /* (00 10 20 01 08 18 28 09) */
+    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+    mmB = _mm_unpacklo_pi16(mmH, mmB);        /* (11 21 02 12 19 29 0A 1A) */
+    mmD = _mm_unpackhi_pi16(mmF, mmH);        /* (22 03 13 23 2A 0B 1B 2B) */
+    mmF = _mm_unpacklo_pi16(mmA, mmE);        /* (04 14 24 05 0C 1C 2C 0D) */
+    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+    mmH = _mm_unpacklo_pi16(mmG, mmA);        /* (15 25 06 16 1D 2D 0E 1E) */
+    mmG = _mm_unpackhi_pi16(mmE, mmG);        /* (26 07 17 27 2E 0F 1F 2F) */
+
+    mmA = _mm_unpacklo_pi32(mmC, mmB);        /* (00 10 20 01 11 21 02 12) */
+    mmE = _mm_unpackhi_pi32(mmC, mmB);        /* (08 18 28 09 19 29 0A 1A) */
+    mmB = _mm_unpacklo_pi32(mmD, mmF);        /* (22 03 13 23 04 14 24 05) */
+    mmF = _mm_unpackhi_pi32(mmD, mmF);        /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+    mmC = _mm_unpacklo_pi32(mmH, mmG);        /* (15 25 06 16 26 07 17 27) */
+    mmG = _mm_unpackhi_pi32(mmH, mmG);        /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmE);
+        _mm_store_si64((__m64 *)(outptr + 32), mmF);
+        _mm_store_si64((__m64 *)(outptr + 40), mmG);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 6 + 3;
+      else
+        col = num_cols * 6;
+
+      asm(".set noreorder\r\n"                /* st24 */
+
+          "li       $8, 24\r\n"
+          "move     $9, %7\r\n"
+          "mov.s    $f4, %1\r\n"
+          "mov.s    $f6, %2\r\n"
+          "mov.s    $f8, %3\r\n"
+          "move     $10, %8\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "mov.s    $f4, %4\r\n"
+          "mov.s    $f6, %5\r\n"
+          "mov.s    $f8, %6\r\n"
+          "subu     $9, $9, 24\r\n"
+          PTR_ADDU  "$10, $10, 24\r\n"
+
+          "1:       \r\n"
+          "li       $8, 16\r\n"               /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "subu     $9, $9, 16\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8,  8\r\n"               /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8,  4\r\n"               /* st4 */
+          "mfc1     $11, $f4\r\n"
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "swl      $11, 3($10)\r\n"
+          "swr      $11, 0($10)\r\n"
+          "li       $8, 32\r\n"
+          "mtc1     $8, $f6\r\n"
+          "dsrl     $f4, $f4, $f6\r\n"
+          "mfc1     $11, $f4\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 4\r\n"
+
+          "4:       \r\n"
+          "li       $8, 2\r\n"                /* st2 */
+          "bltu     $9, $8, 5f\r\n"
+          "nop      \r\n"
+          "ush      $11, 0($10)\r\n"
+          "srl      $11, 16\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 2\r\n"
+
+          "5:       \r\n"
+          "li       $8, 1\r\n"                /* st1 */
+          "bltu     $9, $8, 6f\r\n"
+          "nop      \r\n"
+          "sb       $11, 0($10)\r\n"
+
+          "6:       \r\n"
+          "nop      \r\n"                     /* end */
+          : "=m" (*outptr)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+            "f" (mmG), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+         );
+    }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+    xe = _mm_cmpeq_pi8(xe, xe);
+    xo = _mm_cmpeq_pi8(xo, xo);
+#else
+    xe = _mm_xor_si64(xe, xe);
+    xo = _mm_xor_si64(xo, xo);
+#endif
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+    mm8 = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mm9 = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmA = _mm_unpacklo_pi8(mmE, mmG);         /* (20 30 22 32 24 34 26 36) */
+    mmE = _mm_unpackhi_pi8(mmE, mmG);         /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+    mmG = _mm_unpacklo_pi8(mmB, mmD);         /* (01 11 03 13 05 15 07 17) */
+    mmB = _mm_unpackhi_pi8(mmB, mmD);         /* (09 19 0B 1B 0D 1D 0F 1F) */
+    mmD = _mm_unpacklo_pi8(mmF, mmH);         /* (21 31 23 33 25 35 27 37) */
+    mmF = _mm_unpackhi_pi8(mmF, mmH);         /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+    mmH = _mm_unpacklo_pi16(mm8, mmA);        /* (00 10 20 30 02 12 22 32) */
+    mm8 = _mm_unpackhi_pi16(mm8, mmA);        /* (04 14 24 34 06 16 26 36) */
+    mmA = _mm_unpacklo_pi16(mmG, mmD);        /* (01 11 21 31 03 13 23 33) */
+    mmD = _mm_unpackhi_pi16(mmG, mmD);        /* (05 15 25 35 07 17 27 37) */
+
+    mmG = _mm_unpackhi_pi16(mm9, mmE);        /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+    mm9 = _mm_unpacklo_pi16(mm9, mmE);        /* (08 18 28 38 0A 1A 2A 3A) */
+    mmE = _mm_unpacklo_pi16(mmB, mmF);        /* (09 19 29 39 0B 1B 2B 3B) */
+    mmF = _mm_unpackhi_pi16(mmB, mmF);        /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+    mmB = _mm_unpackhi_pi32(mmH, mmA);        /* (02 12 22 32 03 13 23 33) */
+    mmA = _mm_unpacklo_pi32(mmH, mmA);        /* (00 10 20 30 01 11 21 31) */
+    mmC = _mm_unpacklo_pi32(mm8, mmD);        /* (04 14 24 34 05 15 25 35) */
+    mmD = _mm_unpackhi_pi32(mm8, mmD);        /* (06 16 26 36 07 17 27 37) */
+
+    mmH = _mm_unpackhi_pi32(mmG, mmF);        /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+    mmG = _mm_unpacklo_pi32(mmG, mmF);        /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+    mmF = _mm_unpackhi_pi32(mm9, mmE);        /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+    mmE = _mm_unpacklo_pi32(mm9, mmE);        /* (08 18 28 38 09 19 29 39) */
+
+    if (num_cols >= 8) {
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmD);
+        _mm_store_si64((__m64 *)(outptr + 32), mmE);
+        _mm_store_si64((__m64 *)(outptr + 40), mmF);
+        _mm_store_si64((__m64 *)(outptr + 48), mmG);
+        _mm_store_si64((__m64 *)(outptr + 56), mmH);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {
+      if (output_width & 1)
+        col = num_cols * 2 + 1;
+      else
+        col = num_cols * 2;
+      asm(".set noreorder\r\n"                /* st32 */
+
+          "li       $8, 8\r\n"
+          "move     $9, %10\r\n"
+          "move     $10, %11\r\n"
+          "mov.s    $f4, %2\r\n"
+          "mov.s    $f6, %3\r\n"
+          "mov.s    $f8, %4\r\n"
+          "mov.s    $f10, %5\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "gssdlc1  $f10, 7+24($10)\r\n"
+          "gssdrc1  $f10, 24($10)\r\n"
+          "mov.s    $f4, %6\r\n"
+          "mov.s    $f6, %7\r\n"
+          "mov.s    $f8, %8\r\n"
+          "mov.s    $f10, %9\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 32\r\n"
+
+          "1:       \r\n"
+          "li       $8, 4\r\n"                /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "mov.s    $f6, $f10\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8, 2\r\n"                /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8, 1\r\n"                /* st4 */
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "gsswlc1  $f4, 3($10)\r\n"
+          "gsswrc1  $f4, 0($10)\r\n"
+
+          "4:       \r\n"
+          "li       %1, 0\r\n"                /* end */
+          : "=m" (*outptr), "=r" (col)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+         );
+    }
+
+#endif
+
+  }
+
+  if (!((output_width >> 1) & 7)) {
+    if (output_width & 1) {
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      decenter = 0.0;
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
+      crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
+      cbl = _mm_add_pi16(cbl, decenter);
+      crl = _mm_add_pi16(crl, decenter);
+
+      cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
+      crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
+      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800) */
+      rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, PW_ONE);
+      bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
+      rl = _mm_add_pi16(rl, PW_ONE);
+      rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, cbl);
+      bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
+      rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
+
+      gl = _mm_unpacklo_pi16(cbl, crl);
+      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+      gl = _mm_add_pi32(gl, PD_ONEHALF);
+      gl = _mm_srai_pi32(gl, SCALEBITS);
+      gl = _mm_packs_pi32(gl, zero);       /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+      yl = _mm_unpacklo_pi8(y, zero);         /* Y(0123) */
+      rl = _mm_add_pi16(rl, yl);              /* (R0 R1 R2 R3) */
+      gl = _mm_add_pi16(gl, yl);              /* (G0 G1 G2 G3) */
+      bl = _mm_add_pi16(bl, yl);              /* (B0 B1 B2 B3) */
+      re = _mm_packs_pu16(rl, rl);
+      ge = _mm_packs_pu16(gl, gl);
+      be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "mfc1    $9, $f4\r\n"
+          "ush     $9, 0($8)\r\n"
+          "srl     $9, 16\r\n"
+          "sb      $9, 2($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "$9", "memory"
+         );
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+#else
+      xe = _mm_xor_si64(xe, xe);
+#endif
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "gsswlc1 $f4, 3($8)\r\n"
+          "gsswrc1 $f4, 0($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "memory"
+         );
+#endif
+    }
+  }
+}
+
+
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW inptr, outptr;
+
+  inptr = input_buf[0][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+
+  input_buf[0][in_row_group_ctr] = inptr;
+  output_buf[0] = outptr;
+}
+
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/media/libjpeg/simd/mips64/jdsample-mmi.c b/media/libjpeg/simd/mips64/jdsample-mmi.c
new file mode 100644
index 0000000000..8ae94e7dcf
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_TWO,
+  index_PW_THREE,
+  index_PW_SEVEN,
+  index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(2, 2, 2, 2),
+  _uint64_set_pi16(3, 3, 3, 3),
+  _uint64_set_pi16(7, 7, 7, 7),
+  _uint64_set_pi16(8, 8, 8, 8),
+};
+
+#define PW_ONE    get_const_value(index_PW_ONE)
+#define PW_TWO    get_const_value(index_PW_TWO)
+#define PW_THREE  get_const_value(index_PW_THREE)
+#define PW_SEVEN  get_const_value(index_PW_SEVEN)
+#define PW_EIGHT  get_const_value(index_PW_EIGHT)
+
+
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+  __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+  __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+  __m64 outle, outhe, outlo, outho, outl, outh; \
+  \
+  samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT);  /* ( 1 2 3 -) */ \
+  sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 4) */ \
+  samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 3 - - -) */ \
+  sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT);  /* ( - 4 5 6) */ \
+  \
+  samp1234 = _mm_or_si64(samp123X, sampXXX4);  /* ( 1 2 3 4) */ \
+  samp3456 = _mm_or_si64(samp3XXX, sampX456);  /* ( 3 4 5 6) */ \
+  \
+  sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT);  /* ( - 0 1 2) */ \
+  samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT);  /* ( 5 6 7 -) */ \
+  samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 7 - - -) */ \
+  \
+  samp_1012 = _mm_or_si64(sampX012, wk[row]);            /* (-1 0 1 2) */ \
+  samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]);  /* ( 5 6 7 8) */ \
+  \
+  wk[row] = samp7XXX; \
+  \
+  samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+  samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+  samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+  samp3456 = _mm_add_pi16(samp3456, bias1); \
+  samp1234 = _mm_add_pi16(samp1234, bias2); \
+  samp5678 = _mm_add_pi16(samp5678, bias2); \
+  \
+  outle = _mm_add_pi16(samp_1012, samp0123); \
+  outhe = _mm_add_pi16(samp3456, samp4567); \
+  outle = _mm_srli_pi16(outle, shift);        /* ( 0  2  4  6) */ \
+  outhe = _mm_srli_pi16(outhe, shift);        /* ( 8 10 12 14) */ \
+  outlo = _mm_add_pi16(samp1234, samp0123); \
+  outho = _mm_add_pi16(samp5678, samp4567); \
+  outlo = _mm_srli_pi16(outlo, shift);        /* ( 1  3  5  7) */ \
+  outho = _mm_srli_pi16(outho, shift);        /* ( 9 11 13 15) */ \
+  \
+  outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+  outho = _mm_slli_pi16(outho, BYTE_BIT); \
+  outl = _mm_or_si64(outle, outlo);           /* ( 0  1  2  3  4  5  6  7) */ \
+  outh = _mm_or_si64(outhe, outho);           /* ( 8  9 10 11 12 13 14 15) */ \
+  \
+  _mm_store_si64((__m64 *)outptr##row, outl); \
+  _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol, tmp, tmp1;
+  __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+  __m64 this0l, this0h, this0;
+  __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+  __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+  __m64 next0l, next0h, next0;
+  __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %3, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %3, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this0 = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    this_1 = _mm_load_si64((__m64 *)inptr_1);  /* row[-1][0] */
+    this1 = _mm_load_si64((__m64 *)inptr1);    /* row[ 1][0] */
+
+    this0l = _mm_unpacklo_pi8(this0, zero);    /* row[ 0][0]( 0 1 2 3) */
+    this0h = _mm_unpackhi_pi8(this0, zero);    /* row[ 0][0]( 4 5 6 7) */
+    this_1l = _mm_unpacklo_pi8(this_1, zero);  /* row[-1][0]( 0 1 2 3) */
+    this_1h = _mm_unpackhi_pi8(this_1, zero);  /* row[-1][0]( 4 5 6 7) */
+    this1l = _mm_unpacklo_pi8(this1, zero);    /* row[+1][0]( 0 1 2 3) */
+    this1h = _mm_unpackhi_pi8(this1, zero);    /* row[+1][0]( 4 5 6 7) */
+
+    this0l = _mm_mullo_pi16(this0l, PW_THREE);
+    this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+    thiscolsum_1l = _mm_add_pi16(this_1l, this0l);  /* ( 0 1 2 3) */
+    thiscolsum_1h = _mm_add_pi16(this_1h, this0h);  /* ( 4 5 6 7) */
+    thiscolsum1l = _mm_add_pi16(this0l, this1l);    /* ( 0 1 2 3) */
+    thiscolsum1h = _mm_add_pi16(this0h, this1h);    /* ( 4 5 6 7) */
+
+    /* temporarily save the intermediate data */
+    _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+    _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+    _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+    _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+    wk[0] = _mm_and_si64(thiscolsum_1l, mask0);  /* ( 0 - - -) */
+    wk[1] = _mm_and_si64(thiscolsum1l, mask0);   /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+         outptr0 += 16, outptr1 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next0 = _mm_load_si64((__m64 *)inptr0 + 1);    /* row[ 0][1] */
+        next_1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* row[-1][1] */
+        next1 = _mm_load_si64((__m64 *)inptr1 + 1);    /* row[+1][1] */
+
+        next0l = _mm_unpacklo_pi8(next0, zero);    /* row[ 0][1]( 0 1 2 3) */
+        next0h = _mm_unpackhi_pi8(next0, zero);    /* row[ 0][1]( 4 5 6 7) */
+        next_1l = _mm_unpacklo_pi8(next_1, zero);  /* row[-1][1]( 0 1 2 3) */
+        next_1h = _mm_unpackhi_pi8(next_1, zero);  /* row[-1][1]( 4 5 6 7) */
+        next1l = _mm_unpacklo_pi8(next1, zero);    /* row[+1][1]( 0 1 2 3) */
+        next1h = _mm_unpackhi_pi8(next1, zero);    /* row[+1][1]( 4 5 6 7) */
+
+        next0l = _mm_mullo_pi16(next0l, PW_THREE);
+        next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+        nextcolsum_1l = _mm_add_pi16(next_1l, next0l);  /* ( 0 1 2 3) */
+        nextcolsum_1h = _mm_add_pi16(next_1h, next0h);  /* ( 4 5 6 7) */
+        nextcolsum1l = _mm_add_pi16(next0l, next1l);    /* ( 0 1 2 3) */
+        nextcolsum1h = _mm_add_pi16(next0h, next1h);    /* ( 4 5 6 7) */
+
+        /* temporarily save the intermediate data */
+        _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+        _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+        _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+        _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+        wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+        wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);   /* ( - - - 0) */
+      } else {
+        __m64 tmp;
+
+        /* process the last column block */
+        tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+        wk[2] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+        tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+        wk[3] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+      }
+
+      /* process the upper row */
+      samp0123 = _mm_load_si64((__m64 *)outptr0);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr0 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+      /* process the lower row */
+      samp0123 = _mm_load_si64((__m64 *)outptr1);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr1 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+    }
+  }
+}
+
+
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, outptr0;
+  int inrow, incol, tmp, tmp1;
+  __m64 thisl, this, nextl, next;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+    inptr0 = input_data[inrow];
+    outptr0 = output_data[inrow];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %1, %2\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %1, %3\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr0)
+          : "r" (inptr0), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    thisl = _mm_unpacklo_pi8(this, zero);     /* row[ 0][0]( 0 1 2 3) */
+    wk[0] = _mm_and_si64(thisl, mask0);       /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr0 += 8, outptr0 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next = _mm_load_si64((__m64 *)inptr0 + 1);  /* row[ 0][1] */
+        nextl = _mm_unpacklo_pi8(next, zero);       /* row[ 0][1]( 0 1 2 3) */
+        wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+      } else {
+        __m64 thish;
+
+        /* process the last column block */
+        this = _mm_load_si64((__m64 *)inptr0);  /* row[ 0][0] */
+        thish = _mm_unpackhi_pi8(this, zero);   /* row[ 0][1]( 4 5 6 7) */
+        wk[1] = _mm_and_si64(masklast, thish);  /* ( - - - 7) */
+      }
+
+      /* process the row */
+      this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+      samp0123 = _mm_unpacklo_pi8(this, zero);  /* ( 0 1 2 3) */
+      samp4567 = _mm_unpackhi_pi8(this, zero);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+    }
+  }
+}
diff --git a/media/libjpeg/simd/mips64/jfdctfst-mmi.c b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 0000000000..f7caf09a88
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+
+#define F_0_382  ((short)98)   /* FIX(0.382683433) */
+#define F_0_541  ((short)139)  /* FIX(0.541196100) */
+#define F_0_707  ((short)181)  /* FIX(0.707106781) */
+#define F_1_306  ((short)334)  /* FIX(1.306562965) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F0707,
+  index_PW_F0382,
+  index_PW_F0541,
+  index_PW_F1306
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(F_0_707),
+  _uint64_set1_pi16(F_0_382),
+  _uint64_set1_pi16(F_0_541),
+  _uint64_set1_pi16(F_1_306)
+};
+
+#define PW_F0707  get_const_value(index_PW_F0707)
+#define PW_F0382  get_const_value(index_PW_F0382)
+#define PW_F0541  get_const_value(index_PW_F0541)
+#define PW_F1306  get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+  __m64 mulhi, mullo, mul12, mul34; \
+  \
+  mullo = _mm_mullo_pi16(in, multiplier); \
+  mulhi = _mm_mulhi_pi16(in, multiplier); \
+  mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+  mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+  mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+  mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+  out = _mm_packs_pi32(mul12, mul34); \
+}
+
+#define DO_FDCT_COMMON() { \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+  tmp11 = _mm_add_pi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11); \
+  out4 = _mm_sub_pi16(tmp10, tmp11); \
+  \
+  z1 = _mm_add_pi16(tmp12, tmp13); \
+  DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+  \
+  out2 = _mm_add_pi16(tmp13, z1); \
+  out6 = _mm_sub_pi16(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp4, tmp5); \
+  tmp11 = _mm_add_pi16(tmp5, tmp6); \
+  tmp12 = _mm_add_pi16(tmp6, tmp7); \
+  \
+  z5 = _mm_sub_pi16(tmp10, tmp12); \
+  DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+  \
+  DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+  z2 = _mm_add_pi16(z2, z5); \
+  \
+  DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+  z4 = _mm_add_pi16(z4, z5); \
+  \
+  DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+  \
+  z11 = _mm_add_pi16(tmp7, z3); \
+  z13 = _mm_sub_pi16(tmp7, z3); \
+  \
+  out5 = _mm_add_pi16(z13, z2); \
+  out3 = _mm_sub_pi16(z13, z2); \
+  out1 = _mm_add_pi16(z11, z4); \
+  out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/media/libjpeg/simd/mips64/jfdctint-mmi.c b/media/libjpeg/simd/mips64/jfdctint-mmi.c
new file mode 100644
index 0000000000..7f4dfe9123
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jfdctint-mmi.c
@@ -0,0 +1,398 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+#define FIX_0_298  ((short)2446)   /* FIX(0.298631336) */
+#define FIX_0_390  ((short)3196)   /* FIX(0.390180644) */
+#define FIX_0_541  ((short)4433)   /* FIX(0.541196100) */
+#define FIX_0_765  ((short)6270)   /* FIX(0.765366865) */
+#define FIX_0_899  ((short)7373)   /* FIX(0.899976223) */
+#define FIX_1_175  ((short)9633)   /* FIX(1.175875602) */
+#define FIX_1_501  ((short)12299)  /* FIX(1.501321110) */
+#define FIX_1_847  ((short)15137)  /* FIX(1.847759065) */
+#define FIX_1_961  ((short)16069)  /* FIX(1.961570560) */
+#define FIX_2_053  ((short)16819)  /* FIX(2.053119869) */
+#define FIX_2_562  ((short)20995)  /* FIX(2.562915447) */
+#define FIX_3_072  ((short)25172)  /* FIX(3.072711026) */
+
+enum const_index {
+  index_PW_F130_F054,
+  index_PW_F054_MF130,
+  index_PW_MF078_F117,
+  index_PW_F117_F078,
+  index_PW_MF060_MF089,
+  index_PW_MF089_F060,
+  index_PW_MF050_MF256,
+  index_PW_MF256_F050,
+  index_PD_DESCALE_P1,
+  index_PD_DESCALE_P2,
+  index_PW_DESCALE_P2X
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+  _uint64_set_pi16((1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)),
+                   (1 << (PASS1_BITS - 1)), (1 << (PASS1_BITS - 1)))
+};
+
+#define PW_F130_F054    get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130   get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117   get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078    get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089  get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060   get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256  get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050   get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1   get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2   get_const_value(index_PD_DESCALE_P2)
+#define PW_DESCALE_P2X  get_const_value(index_PW_DESCALE_P2X)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  __m64 tmp1312l, tmp1312h, tmp47l, tmp47h, tmp4l, tmp4h, tmp7l, tmp7h; \
+  __m64 tmp56l, tmp56h, tmp5l, tmp5h, tmp6l, tmp6h; \
+  __m64 out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out5l, out5h, out6l, out6h, out7l, out7h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * out2 = z1 + tmp13 * 0.765366865; \
+   * out6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * out2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * out6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = _mm_unpacklo_pi16(tmp13, tmp12); \
+  tmp1312h = _mm_unpackhi_pi16(tmp13, tmp12); \
+  \
+  out2l = _mm_madd_pi16(tmp1312l, PW_F130_F054); \
+  out2h = _mm_madd_pi16(tmp1312h, PW_F130_F054); \
+  out6l = _mm_madd_pi16(tmp1312l, PW_F054_MF130); \
+  out6h = _mm_madd_pi16(tmp1312h, PW_F054_MF130); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = _mm_add_pi16(tmp4, tmp6); \
+  z4 = _mm_add_pi16(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * out7 = tmp4 + z1 + z3;  out5 = tmp5 + z2 + z4; \
+   * out3 = tmp6 + z2 + z3;  out1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * out7 = tmp4 + z3;  out5 = tmp5 + z4; \
+   * out3 = tmp6 + z3;  out1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = _mm_unpacklo_pi16(tmp4, tmp7); \
+  tmp47h = _mm_unpackhi_pi16(tmp4, tmp7); \
+  \
+  tmp4l = _mm_madd_pi16(tmp47l, PW_MF060_MF089); \
+  tmp4h = _mm_madd_pi16(tmp47h, PW_MF060_MF089); \
+  tmp7l = _mm_madd_pi16(tmp47l, PW_MF089_F060); \
+  tmp7h = _mm_madd_pi16(tmp47h, PW_MF089_F060); \
+  \
+  out7l = _mm_add_pi32(tmp4l, z3l); \
+  out7h = _mm_add_pi32(tmp4h, z3h); \
+  out1l = _mm_add_pi32(tmp7l, z4l); \
+  out1h = _mm_add_pi32(tmp7h, z4h); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  \
+  tmp56l = _mm_unpacklo_pi16(tmp5, tmp6); \
+  tmp56h = _mm_unpackhi_pi16(tmp5, tmp6); \
+  \
+  tmp5l = _mm_madd_pi16(tmp56l, PW_MF050_MF256); \
+  tmp5h = _mm_madd_pi16(tmp56h, PW_MF050_MF256); \
+  tmp6l = _mm_madd_pi16(tmp56l, PW_MF256_F050); \
+  tmp6h = _mm_madd_pi16(tmp56h, PW_MF256_F050); \
+  \
+  out5l = _mm_add_pi32(tmp5l, z4l); \
+  out5h = _mm_add_pi32(tmp5h, z4h); \
+  out3l = _mm_add_pi32(tmp6l, z3l); \
+  out3h = _mm_add_pi32(tmp6h, z3h); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  __m64 tmp10, tmp11; \
+  \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  out0 = _mm_slli_pi16(out0, PASS1_BITS); \
+  out4 = _mm_slli_pi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(1) \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  __m64 tmp10, tmp11; \
+  \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3);           /* tmp10=tmp0+tmp3 */ \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3);           /* tmp13=tmp0-tmp3 */ \
+  tmp11 = _mm_add_pi16(tmp1, tmp2);           /* tmp11=tmp1+tmp2 */ \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2);           /* tmp12=tmp1-tmp2 */ \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11);          /* out0=tmp10+tmp11 */ \
+  out4 = _mm_sub_pi16(tmp10, tmp11);          /* out4=tmp10-tmp11 */ \
+  \
+  out0 = _mm_add_pi16(out0, PW_DESCALE_P2X); \
+  out4 = _mm_add_pi16(out4, PW_DESCALE_P2X); \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS); \
+  \
+  DO_FDCT_COMMON(2) \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+
+void jsimd_fdct_islow_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp12, tmp13;
+  DCTELEM *dataptr = data;
+
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/media/libjpeg/simd/mips64/jidctfst-mmi.c b/media/libjpeg/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 0000000000..503bb35a3c
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+
+#define FIX_1_082  ((short)277)                   /* FIX(1.082392200) */
+#define FIX_1_414  ((short)362)                   /* FIX(1.414213562) */
+#define FIX_1_847  ((short)473)                   /* FIX(1.847759065) */
+#define FIX_2_613  ((short)669)                   /* FIX(2.613125930) */
+#define FIX_1_613  ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - FIX(1) */
+
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+enum const_index {
+  index_PW_F1082,
+  index_PW_F1414,
+  index_PW_F1847,
+  index_PW_MF1613,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+  _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+  _uint64_set1_pi8(CENTERJSAMPLE)
+};
+
+#define PW_F1414        get_const_value(index_PW_F1414)
+#define PW_F1847        get_const_value(index_PW_F1847)
+#define PW_MF1613       get_const_value(index_PW_MF1613)
+#define PW_F1082        get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON() { \
+  tmp7 = _mm_add_pi16(z11, z13); \
+  \
+  tmp11 = _mm_sub_pi16(z11, z13); \
+  tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+  \
+  tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
+  \
+  z5 = _mm_add_pi16(tmp10, tmp12); \
+  z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+  \
+  tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+  tmp10 = _mm_sub_pi16(tmp10, z5); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_add_pi16(tmp12, z5); \
+  \
+  /* Final output stage */ \
+  \
+  tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+  tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+  tmp4 = _mm_add_pi16(tmp10, tmp5); \
+  \
+  out0 = _mm_add_pi16(tmp0, tmp7); \
+  out7 = _mm_sub_pi16(tmp0, tmp7); \
+  out1 = _mm_add_pi16(tmp1, tmp6); \
+  out6 = _mm_sub_pi16(tmp1, tmp6); \
+  \
+  out2 = _mm_add_pi16(tmp2, tmp5); \
+  out5 = _mm_sub_pi16(tmp2, tmp5); \
+  out4 = _mm_add_pi16(tmp3, tmp4); \
+  out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l);    /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+  tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+  tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+  tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp2); \
+  tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+  tmp13 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+  tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+  \
+  z13 = _mm_add_pi16(tmp6, tmp5); \
+  z10 = _mm_sub_pi16(tmp6, tmp5); \
+  z11 = _mm_add_pi16(tmp4, tmp7); \
+  z12 = _mm_sub_pi16(tmp4, tmp7); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h; \
+  __m64 col0, col1, col2, col3; \
+  __m64 row06, row17, row24, row35; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(row0l, row4l); \
+  tmp11 = _mm_sub_pi16(row0l, row4l); \
+  tmp13 = _mm_add_pi16(row2l, row6l); \
+  \
+  tmp12 = _mm_sub_pi16(row2l, row6l); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_pi16(row5l, row3l); \
+  z10 = _mm_sub_pi16(row5l, row3l); \
+  z11 = _mm_add_pi16(row1l, row7l); \
+  z12 = _mm_sub_pi16(row1l, row7l); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+  out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+  out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+  out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+  out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+  out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+  out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 tmp10, tmp11, tmp12, tmp13;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 z5, z10, z11, z12, z13;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/media/libjpeg/simd/mips64/jidctint-mmi.c b/media/libjpeg/simd/mips64/jidctint-mmi.c
new file mode 100644
index 0000000000..cd3db980c5
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jidctint-mmi.c
@@ -0,0 +1,571 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018, 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCUATE INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+#define CENTERJSAMPLE  128
+
+#define FIX_0_298  ((short)2446)  /* FIX(0.298631336) */
+#define FIX_0_390  ((short)3196)  /* FIX(0.390180644) */
+#define FIX_0_899  ((short)7373)  /* FIX(0.899976223) */
+#define FIX_0_541  ((short)4433)  /* FIX(0.541196100) */
+#define FIX_0_765  ((short)6270)  /* FIX(0.765366865) */
+#define FIX_1_175  ((short)9633)  /* FIX(1.175875602) */
+#define FIX_1_501  ((short)12299) /* FIX(1.501321110) */
+#define FIX_1_847  ((short)15137) /* FIX(1.847759065) */
+#define FIX_1_961  ((short)16069) /* FIX(1.961570560) */
+#define FIX_2_053  ((short)16819) /* FIX(2.053119869) */
+#define FIX_2_562  ((short)20995) /* FIX(2.562915447) */
+#define FIX_3_072  ((short)25172) /* FIX(3.072711026) */
+
+enum const_index {
+  index_PW_F130_F054,
+  index_PW_F054_MF130,
+  index_PW_MF078_F117,
+  index_PW_F117_F078,
+  index_PW_MF060_MF089,
+  index_PW_MF089_F060,
+  index_PW_MF050_MF256,
+  index_PW_MF256_F050,
+  index_PD_DESCALE_P1,
+  index_PD_DESCALE_P2,
+  index_PB_CENTERJSAMP
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(FIX_0_541, (FIX_0_541 + FIX_0_765),
+                   FIX_0_541, (FIX_0_541 + FIX_0_765)),
+  _uint64_set_pi16((FIX_0_541 - FIX_1_847), FIX_0_541,
+                   (FIX_0_541 - FIX_1_847), FIX_0_541),
+  _uint64_set_pi16(FIX_1_175, (FIX_1_175 - FIX_1_961),
+                   FIX_1_175, (FIX_1_175 - FIX_1_961)),
+  _uint64_set_pi16((FIX_1_175 - FIX_0_390), FIX_1_175,
+                   (FIX_1_175 - FIX_0_390), FIX_1_175),
+  _uint64_set_pi16(-FIX_0_899, (FIX_0_298 - FIX_0_899),
+                   -FIX_0_899, (FIX_0_298 - FIX_0_899)),
+  _uint64_set_pi16((FIX_1_501 - FIX_0_899), -FIX_0_899,
+                   (FIX_1_501 - FIX_0_899), -FIX_0_899),
+  _uint64_set_pi16(-FIX_2_562, (FIX_2_053 - FIX_2_562),
+                   -FIX_2_562, (FIX_2_053 - FIX_2_562)),
+  _uint64_set_pi16((FIX_3_072 - FIX_2_562), -FIX_2_562,
+                   (FIX_3_072 - FIX_2_562), -FIX_2_562),
+  _uint64_set_pi32((1 << (DESCALE_P1 - 1)), (1 << (DESCALE_P1 - 1))),
+  _uint64_set_pi32((1 << (DESCALE_P2 - 1)), (1 << (DESCALE_P2 - 1))),
+  _uint64_set_pi8(CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE,
+                  CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE, CENTERJSAMPLE)
+};
+
+#define PW_F130_F054    get_const_value(index_PW_F130_F054)
+#define PW_F054_MF130   get_const_value(index_PW_F054_MF130)
+#define PW_MF078_F117   get_const_value(index_PW_MF078_F117)
+#define PW_F117_F078    get_const_value(index_PW_F117_F078)
+#define PW_MF060_MF089  get_const_value(index_PW_MF060_MF089)
+#define PW_MF089_F060   get_const_value(index_PW_MF089_F060)
+#define PW_MF050_MF256  get_const_value(index_PW_MF050_MF256)
+#define PW_MF256_F050   get_const_value(index_PW_MF256_F050)
+#define PD_DESCALE_P1   get_const_value(index_PD_DESCALE_P1)
+#define PD_DESCALE_P2   get_const_value(index_PD_DESCALE_P2)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON(PASS) { \
+  __m64 tmp0_3l, tmp0_3h, tmp1_2l, tmp1_2h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 z34l, z34h, z3l, z3h, z4l, z4h, z3, z4; \
+  __m64 out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h; \
+  __m64 out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; \
+  \
+  z3 = _mm_add_pi16(tmp0, tmp2); \
+  z4 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = _mm_unpacklo_pi16(z3, z4); \
+  z34h = _mm_unpackhi_pi16(z3, z4); \
+  z3l = _mm_madd_pi16(z34l, PW_MF078_F117); \
+  z3h = _mm_madd_pi16(z34h, PW_MF078_F117); \
+  z4l = _mm_madd_pi16(z34l, PW_F117_F078); \
+  z4h = _mm_madd_pi16(z34h, PW_F117_F078); \
+  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
+  \
+  tmp0_3l = _mm_unpacklo_pi16(tmp0, tmp3); \
+  tmp0_3h = _mm_unpackhi_pi16(tmp0, tmp3); \
+  \
+  tmp0l = _mm_madd_pi16(tmp0_3l, PW_MF060_MF089); \
+  tmp0h = _mm_madd_pi16(tmp0_3h, PW_MF060_MF089); \
+  tmp3l = _mm_madd_pi16(tmp0_3l, PW_MF089_F060); \
+  tmp3h = _mm_madd_pi16(tmp0_3h, PW_MF089_F060); \
+  \
+  tmp0l = _mm_add_pi32(tmp0l, z3l); \
+  tmp0h = _mm_add_pi32(tmp0h, z3h); \
+  tmp3l = _mm_add_pi32(tmp3l, z4l); \
+  tmp3h = _mm_add_pi32(tmp3h, z4h); \
+  \
+  tmp1_2l = _mm_unpacklo_pi16(tmp1, tmp2); \
+  tmp1_2h = _mm_unpackhi_pi16(tmp1, tmp2); \
+  \
+  tmp1l = _mm_madd_pi16(tmp1_2l, PW_MF050_MF256); \
+  tmp1h = _mm_madd_pi16(tmp1_2h, PW_MF050_MF256); \
+  tmp2l = _mm_madd_pi16(tmp1_2l, PW_MF256_F050); \
+  tmp2h = _mm_madd_pi16(tmp1_2h, PW_MF256_F050); \
+  \
+  tmp1l = _mm_add_pi32(tmp1l, z4l); \
+  tmp1h = _mm_add_pi32(tmp1h, z4h); \
+  tmp2l = _mm_add_pi32(tmp2l, z3l); \
+  tmp2h = _mm_add_pi32(tmp2h, z3h); \
+  \
+  /* Final output stage */ \
+  \
+  out0l = _mm_add_pi32(tmp10l, tmp3l); \
+  out0h = _mm_add_pi32(tmp10h, tmp3h); \
+  out7l = _mm_sub_pi32(tmp10l, tmp3l); \
+  out7h = _mm_sub_pi32(tmp10h, tmp3h); \
+  \
+  out0l = _mm_add_pi32(out0l, PD_DESCALE_P##PASS); \
+  out0h = _mm_add_pi32(out0h, PD_DESCALE_P##PASS); \
+  out0l = _mm_srai_pi32(out0l, DESCALE_P##PASS); \
+  out0h = _mm_srai_pi32(out0h, DESCALE_P##PASS); \
+  \
+  out7l = _mm_add_pi32(out7l, PD_DESCALE_P##PASS); \
+  out7h = _mm_add_pi32(out7h, PD_DESCALE_P##PASS); \
+  out7l = _mm_srai_pi32(out7l, DESCALE_P##PASS); \
+  out7h = _mm_srai_pi32(out7h, DESCALE_P##PASS); \
+  \
+  out0 = _mm_packs_pi32(out0l, out0h); \
+  out7 = _mm_packs_pi32(out7l, out7h); \
+  \
+  out1l = _mm_add_pi32(tmp11l, tmp2l); \
+  out1h = _mm_add_pi32(tmp11h, tmp2h); \
+  out6l = _mm_sub_pi32(tmp11l, tmp2l); \
+  out6h = _mm_sub_pi32(tmp11h, tmp2h); \
+  \
+  out1l = _mm_add_pi32(out1l, PD_DESCALE_P##PASS); \
+  out1h = _mm_add_pi32(out1h, PD_DESCALE_P##PASS); \
+  out1l = _mm_srai_pi32(out1l, DESCALE_P##PASS); \
+  out1h = _mm_srai_pi32(out1h, DESCALE_P##PASS); \
+  \
+  out6l = _mm_add_pi32(out6l, PD_DESCALE_P##PASS); \
+  out6h = _mm_add_pi32(out6h, PD_DESCALE_P##PASS); \
+  out6l = _mm_srai_pi32(out6l, DESCALE_P##PASS); \
+  out6h = _mm_srai_pi32(out6h, DESCALE_P##PASS); \
+  \
+  out1 = _mm_packs_pi32(out1l, out1h); \
+  out6 = _mm_packs_pi32(out6l, out6h); \
+  \
+  out2l = _mm_add_pi32(tmp12l, tmp1l); \
+  out2h = _mm_add_pi32(tmp12h, tmp1h); \
+  out5l = _mm_sub_pi32(tmp12l, tmp1l); \
+  out5h = _mm_sub_pi32(tmp12h, tmp1h); \
+  \
+  out2l = _mm_add_pi32(out2l, PD_DESCALE_P##PASS); \
+  out2h = _mm_add_pi32(out2h, PD_DESCALE_P##PASS); \
+  out2l = _mm_srai_pi32(out2l, DESCALE_P##PASS); \
+  out2h = _mm_srai_pi32(out2h, DESCALE_P##PASS); \
+  \
+  out5l = _mm_add_pi32(out5l, PD_DESCALE_P##PASS); \
+  out5h = _mm_add_pi32(out5h, PD_DESCALE_P##PASS); \
+  out5l = _mm_srai_pi32(out5l, DESCALE_P##PASS); \
+  out5h = _mm_srai_pi32(out5h, DESCALE_P##PASS); \
+  \
+  out2 = _mm_packs_pi32(out2l, out2h); \
+  out5 = _mm_packs_pi32(out5l, out5h); \
+  \
+  out3l = _mm_add_pi32(tmp13l, tmp0l); \
+  out3h = _mm_add_pi32(tmp13h, tmp0h); \
+  \
+  out4l = _mm_sub_pi32(tmp13l, tmp0l); \
+  out4h = _mm_sub_pi32(tmp13h, tmp0h); \
+  \
+  out3l = _mm_add_pi32(out3l, PD_DESCALE_P##PASS); \
+  out3h = _mm_add_pi32(out3h, PD_DESCALE_P##PASS); \
+  out3l = _mm_srai_pi32(out3l, DESCALE_P##PASS); \
+  out3h = _mm_srai_pi32(out3h, DESCALE_P##PASS); \
+  \
+  out4l = _mm_add_pi32(out4l, PD_DESCALE_P##PASS); \
+  out4h = _mm_add_pi32(out4h, PD_DESCALE_P##PASS); \
+  out4l = _mm_srai_pi32(out4l, DESCALE_P##PASS); \
+  out4h = _mm_srai_pi32(out4h, DESCALE_P##PASS); \
+  \
+  out3 = _mm_packs_pi32(out3l, out3h); \
+  out4 = _mm_packs_pi32(out4l, out4h); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 z23, z2, z3, z23l, z23h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  __m32 col0a, col1a, mm0; \
+  \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l); \
+      dcval = _mm_slli_pi16(dcval, PASS1_BITS);  /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  z2 = _mm_mullo_pi16(col2l, quant2l); \
+  z3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  z23l = _mm_unpacklo_pi16(z2, z3); \
+  z23h = _mm_unpackhi_pi16(z2, z3); \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z2 = _mm_mullo_pi16(col0l, quant0l); \
+  z3 = _mm_mullo_pi16(col4l, quant4l); \
+  \
+  z23 = _mm_add_pi16(z2, z3); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(z2, z3); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp0 = _mm_mullo_pi16(col7l, quant7l); \
+  tmp1 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp2 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp3 = _mm_mullo_pi16(col1l, quant1l); \
+  \
+  DO_IDCT_COMMON(1) \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 z23, z23l, z23h; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h, row06, row17, row24, row35; \
+  __m64 col0, col1, col2, col3; \
+  __m64 tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h; \
+  __m64 tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h; \
+  \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
+  \
+  z23l = _mm_unpacklo_pi16(row2l, row6l); \
+  z23h = _mm_unpackhi_pi16(row2l, row6l); \
+  \
+  tmp3l = _mm_madd_pi16(z23l, PW_F130_F054); \
+  tmp3h = _mm_madd_pi16(z23h, PW_F130_F054); \
+  tmp2l = _mm_madd_pi16(z23l, PW_F054_MF130); \
+  tmp2h = _mm_madd_pi16(z23h, PW_F054_MF130); \
+  \
+  z23 = _mm_add_pi16(row0l, row4l); \
+  tmp0l = _mm_loadlo_pi16_f(z23); \
+  tmp0h = _mm_loadhi_pi16_f(z23); \
+  tmp0l = _mm_srai_pi32(tmp0l, (16 - CONST_BITS)); \
+  tmp0h = _mm_srai_pi32(tmp0h, (16 - CONST_BITS)); \
+  \
+  tmp10l = _mm_add_pi32(tmp0l, tmp3l); \
+  tmp10h = _mm_add_pi32(tmp0h, tmp3h); \
+  tmp13l = _mm_sub_pi32(tmp0l, tmp3l); \
+  tmp13h = _mm_sub_pi32(tmp0h, tmp3h); \
+  \
+  z23 = _mm_sub_pi16(row0l, row4l); \
+  tmp1l = _mm_loadlo_pi16_f(z23); \
+  tmp1h = _mm_loadhi_pi16_f(z23); \
+  tmp1l = _mm_srai_pi32(tmp1l, (16 - CONST_BITS)); \
+  tmp1h = _mm_srai_pi32(tmp1h, (16 - CONST_BITS)); \
+  \
+  tmp11l = _mm_add_pi32(tmp1l, tmp2l); \
+  tmp11h = _mm_add_pi32(tmp1h, tmp2h); \
+  tmp12l = _mm_sub_pi32(tmp1l, tmp2l); \
+  tmp12h = _mm_sub_pi32(tmp1h, tmp2h); \
+  \
+  /* Odd part */ \
+  \
+  tmp0 = row7l; \
+  tmp1 = row5l; \
+  tmp2 = row3l; \
+  tmp3 = row1l; \
+  \
+  DO_IDCT_COMMON(2) \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+
+void jsimd_idct_islow_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+
+  DO_IDCT_PASS1(1)
+nextcolumn1:
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/media/libjpeg/simd/mips64/jquanti-mmi.c b/media/libjpeg/simd/mips64/jquanti-mmi.c
new file mode 100644
index 0000000000..339002fd80
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander.  All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define DO_QUANT() { \
+  __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+  __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+  \
+  rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+  rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+  \
+  /* Branch-less absolute value */ \
+  rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1));  /* -1 if value < 0, */ \
+                                                /* 0 otherwise */ \
+  rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
+  corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+  \
+  rowlsave = rowl = _mm_add_pi16(rowl, corrl);  /* correction + roundfactor */ \
+  rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+  \
+  recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
+  reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, recipl); \
+  rowh = _mm_mulhi_pi16(rowh, reciph); \
+  \
+  /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+  /* initial value (input value is never negative as we inverted it at the */ \
+  /* start of this routine) */ \
+  rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
+  scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, scalel); \
+  rowh = _mm_mulhi_pi16(rowh, scaleh); \
+  \
+  /* determine if scale is negative */ \
+  scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+  scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+  \
+  /* and add input if it is */ \
+  scalel = _mm_and_si64(scalel, rowlsave); \
+  scaleh = _mm_and_si64(scaleh, rowhsave); \
+  rowl = _mm_add_pi16(rowl, scalel); \
+  rowh = _mm_add_pi16(rowh, scaleh); \
+  \
+  /* then check if negative input */ \
+  rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+  rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+  \
+  /* and add scale if it is */ \
+  rowlsave = _mm_and_si64(rowlsave, scalel); \
+  rowhsave = _mm_and_si64(rowhsave, scaleh); \
+  rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* val = -val */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+  _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+  \
+  workspace += DCTSIZE; \
+  divisors += DCTSIZE; \
+  output_ptr += DCTSIZE; \
+}
+
+
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  JCOEFPTR output_ptr = coef_block;
+
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+}
diff --git a/media/libjpeg/simd/mips64/jsimd.c b/media/libjpeg/simd/mips64/jsimd.c
new file mode 100644
index 0000000000..e8f1af562b
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd.c
@@ -0,0 +1,870 @@
+/*
+ * jsimd_mips64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
+ * Copyright (C) 2015, 2018, Matthieu Darbois.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit MIPS architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+static unsigned int simd_support = ~0;
+
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "loongson-mmi"))
+        simd_support |= JSIMD_MMI;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+#if defined(__linux__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__linux__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__mips_loongson_vector_rev)
+  /* Only enable MMI by default on non-Linux platforms when the compiler flags
+   * support it. */
+  simd_support |= JSIMD_MMI;
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEMMI");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_MMI;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_c_can_null_convert(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_ycc_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_ycc_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_ycc_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_ycc_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_gray_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_gray_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_gray_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_gray_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_ycc_extrgb_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_ycc_extrgbx_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_ycc_extbgr_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_ycc_extbgrx_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_ycc_extxbgr_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_ycc_extxrgb_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_ycc_rgb_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(void)
+jsimd_c_null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                     JSAMPIMAGE output_buf, JDIMENSION output_row,
+                     int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_smooth_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_mmi(cinfo->image_width, cinfo->max_v_samp_factor,
+                            compptr->v_samp_factor, compptr->width_in_blocks,
+                            input_data, output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v2_smooth_downsample(j_compress_ptr cinfo,
+                             jpeg_component_info *compptr,
+                             JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_int_upsample(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(void)
+jsimd_int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                   JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v2_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v1_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_mmi(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_mmi(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_6x6(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_12x12(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/media/libjpeg/simd/mips64/jsimd_mmi.h b/media/libjpeg/simd/mips64/jsimd_mmi.h
new file mode 100644
index 0000000000..5e4261c9d9
--- /dev/null
+++ b/media/libjpeg/simd/mips64/jsimd_mmi.h
@@ -0,0 +1,69 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           QingfaLiu   <liuqingfa-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jdct.h"
+#include "loongson-mmintrin.h"
+
+
+/* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU  "daddu "
+# define PTR_SLL   "dsll "
+#else
+# define PTR_ADDU  "addu "
+# define PTR_SLL   "sll "
+#endif
+
+#define SIZEOF_MMWORD  8
+#define BYTE_BIT  8
+#define WORD_BIT  16
+#define SCALEBITS  16
+
+#define _uint64_set_pi8(a, b, c, d, e, f, g, h) \
+  (((uint64_t)(uint8_t)a << 56) | \
+   ((uint64_t)(uint8_t)b << 48) | \
+   ((uint64_t)(uint8_t)c << 40) | \
+   ((uint64_t)(uint8_t)d << 32) | \
+   ((uint64_t)(uint8_t)e << 24) | \
+   ((uint64_t)(uint8_t)f << 16) | \
+   ((uint64_t)(uint8_t)g << 8)  | \
+   ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a)  _uint64_set_pi8(a, a, a, a, a, a, a, a)
+#define _uint64_set_pi16(a, b, c, d) \
+  (((uint64_t)(uint16_t)a << 48) | \
+   ((uint64_t)(uint16_t)b << 32) | \
+   ((uint64_t)(uint16_t)c << 16) | \
+   ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a)  _uint64_set_pi16(a, a, a, a)
+#define _uint64_set_pi32(a, b) \
+  (((uint64_t)(uint32_t)a << 32) | \
+   ((uint64_t)(uint32_t)b))
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])
diff --git a/media/libjpeg/simd/mips64/loongson-mmintrin.h b/media/libjpeg/simd/mips64/loongson-mmintrin.h
new file mode 100644
index 0000000000..db9b35ab60
--- /dev/null
+++ b/media/libjpeg/simd/mips64/loongson-mmintrin.h
@@ -0,0 +1,1334 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#ifndef __LOONGSON_MMINTRIN_H__
+#define __LOONGSON_MMINTRIN_H__
+
+#include <stdint.h>
+
+
+#define FUNCTION_ATTRIBS \
+  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+
+
+/* Vectors are stored in 64-bit floating-point registers. */
+typedef double __m64;
+
+/* Having a 32-bit datatype allows us to use 32-bit loads in places like
+   load8888. */
+typedef float __m32;
+
+
+/********** Set Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setzero_si64(void)
+{
+  return 0.0;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
+            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__b6 << 24) |
+                ((uint32_t)__b4 << 16) |
+                ((uint32_t)__b2 << 8) |
+                (uint32_t)__b0;
+  uint32_t hi = ((uint32_t)__b7 << 24) |
+                ((uint32_t)__b5 << 16) |
+                ((uint32_t)__b3 << 8) |
+                (uint32_t)__b1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklbh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
+{
+  __m64 ret;
+  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
+  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
+
+  asm("mtc1      %1, %0\n\t"
+      "mtc1      %2, $f0\n\t"
+      "punpcklhw %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (lo), "r" (hi)
+      : "$f0"
+     );
+
+  return ret;
+}
+
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set_pi32(uint32_t __i1, uint32_t __i0)
+{
+  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  } else if (__i1 == __i0) {
+    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
+    __m64 ret;
+
+    asm("pshufh %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+       );
+
+    return ret;
+  } else {
+    uint64_t val = ((uint64_t)__i1 << 32) |
+                   ((uint64_t)__i0 <<  0);
+
+    return *(__m64 *)&val;
+  }
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi8(uint8_t __b0)
+{
+  __m64 ret;
+
+  asm("sll    $8, %1, 8\n\t"
+      "or     %1, %1, $8\n\t"
+      "mtc1   %1, %0\n\t"
+      "mtc1   $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__b0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi16(uint16_t __h0)
+{
+  __m64 ret;
+
+  asm("mtc1   %1, %0\n\t"
+      "mtc1   $0, $f0\n\t"
+      "pshufh %0, %0, $f0\n\t"
+      : "=f" (ret)
+      : "r" (__h0)
+      : "$8", "$f0"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_set1_pi32(unsigned __i0)
+{
+  return _mm_set_pi32(__i0, __i0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
+             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
+{
+  return _mm_set_pi8(__h7, __h6, __h5, __h4,
+                     __h3, __h2, __h1, __h0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
+{
+  return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
+{
+  return _mm_set_pi32(__i1, __i0);
+}
+
+
+/********** Arithmetic Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_add_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("paddush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_avg_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pavgh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaddhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_max_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmaxub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_min_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pminub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline int FUNCTION_ATTRIBS
+_mm_movemask_pi8(__m64 __m1)
+{
+  int ret;
+
+  asm("pmovmskb %0, %1\n\t"
+      : "=r" (ret)
+      : "y" (__m1)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mulhi_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmulhuh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmullh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_mul_pu32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pmuluw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sad_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psadbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_asub_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pasubub %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_biadd_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("biadd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_sub_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubsh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubusb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("psubush %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Logical Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("and %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("andn %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si32(__m32 __m1, __m32 __m2)
+{
+  __m32 ret;
+
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("or %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("xor %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Shift Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psllh  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psllw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_slli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsll  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrlh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrlw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srli_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsrl  %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi16(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psrah %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_pi32(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("psraw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_srai_si64(__m64 __m, int64_t __count)
+{
+  __m64 ret;
+
+  asm("dsra %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__count)
+     );
+
+  return ret;
+}
+
+
+/********** Conversion Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+to_m64(uint64_t x)
+{
+  return *(__m64 *)&x;
+}
+
+extern __inline uint64_t FUNCTION_ATTRIBS
+to_uint64(__m64 x)
+{
+  return *(uint64_t *)&x;
+}
+
+
+/********** Comparison Intrinsics **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpeqw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgtb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpgtw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpltb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmplth %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_cmplt_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("pcmpltw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+/********** Miscellaneous Operations **********/
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsshb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packsswh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("packushb %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_extract_pi16(__m64 __m, int64_t __pos)
+{
+  __m64 ret;
+
+  asm("pextrh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__pos)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
+{
+  __m64 ret;
+
+  switch (__pos) {
+  case 0:
+
+    asm("pinsrh_0 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+
+  case 1:
+
+    asm("pinsrh_1 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+  case 2:
+
+    asm("pinsrh_2 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+
+  case 3:
+
+    asm("pinsrh_3 %0, %1, %2\n\t"
+        : "=f" (ret)
+        : "f" (__m1), "f" (__m2), "i" (__pos)
+       );
+
+    break;
+  }
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_shuffle_pi16(__m64 __m, int64_t __n)
+{
+  __m64 ret;
+
+  asm("pshufh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m), "f" (*(__m64 *)&__n)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpckhwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
+   which preserves the data. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32,
+   datatype, which allows load8888 to use 32-bit loads. */
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklbh %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklhw %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
+{
+  __m64 ret;
+
+  asm("punpcklwd %0, %1, %2\n\t"
+      : "=f" (ret)
+      : "f" (__m1), "f" (__m2)
+     );
+
+  return ret;
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_pi32(__m32 *dest, __m64 src)
+{
+  src = _mm_packs_pu16(src, _mm_setzero_si64());
+
+  asm("swc1 %1, %0\n\t"
+      : "=m" (*dest)
+      : "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_store_si64(__m64 *dest, __m64 src)
+{
+  asm("sdc1 %1, %0 \n\t"
+      : "=m" (*dest)
+      : "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"
+      "gssdrc1 %1, 0(%0) \n\t"
+      :
+      : "r" (dest), "f" (src)
+      : "memory"
+     );
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si32(const __m32 *src)
+{
+  __m32 ret;
+
+  asm("lwc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_load_si64(const __m64 *src)
+{
+  __m64 ret;
+
+  asm("ldc1 %0, %1\n\t"
+      : "=f" (ret)
+      : "m" (*src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadu_si64(const __m64 *src)
+{
+  __m64 ret;
+
+  asm("gsldlc1 %0,  7(%1)\n\t"
+      "gsldrc1 %0,  0(%1)\n\t"
+      : "=f" (ret)
+      : "r" (src)
+      : "memory"
+     );
+
+  return ret;
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8(const uint32_t *src)
+{
+  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi8_f(__m64 src)
+{
+  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi8_f(__m64 src)
+{
+  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16(__m64 src)
+{
+  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadlo_pi16_f(__m64 src)
+{
+  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16(__m64 src)
+{
+  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_loadhi_pi16_f(__m64 src)
+{
+  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha(__m64 pixel)
+{
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
+}
+
+extern __inline __m64 FUNCTION_ATTRIBS
+_mm_expand_alpha_rev(__m64 pixel)
+{
+  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#endif  /* __LOONGSON_MMINTRIN_H__ */
diff --git a/media/libjpeg/simd/nasm/jcolsamp.inc b/media/libjpeg/simd/nasm/jcolsamp.inc
new file mode 100644
index 0000000000..6f6d7f29d1
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jcolsamp.inc
@@ -0,0 +1,135 @@
+;
+; jcolsamp.inc - private declarations for color conversion & up/downsampling
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+
+; --------------------------------------------------------------------------
+
+; pseudo-resisters to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define mmA  mm0
+%define mmB  mm1
+%define xmmA  xmm0
+%define xmmB  xmm1
+%define ymmA  ymm0
+%define ymmB  ymm1
+%elif RGB_GREEN == 0
+%define mmA  mm2
+%define mmB  mm3
+%define xmmA  xmm2
+%define xmmB  xmm3
+%define ymmA  ymm2
+%define ymmB  ymm3
+%elif RGB_BLUE == 0
+%define mmA  mm4
+%define mmB  mm5
+%define xmmA  xmm4
+%define xmmB  xmm5
+%define ymmA  ymm4
+%define ymmB  ymm5
+%else
+%define mmA  mm6
+%define mmB  mm7
+%define xmmA  xmm6
+%define xmmB  xmm7
+%define ymmA  ymm6
+%define ymmB  ymm7
+%endif
+
+%if RGB_RED == 1
+%define mmC  mm0
+%define mmD  mm1
+%define xmmC  xmm0
+%define xmmD  xmm1
+%define ymmC  ymm0
+%define ymmD  ymm1
+%elif RGB_GREEN == 1
+%define mmC  mm2
+%define mmD  mm3
+%define xmmC  xmm2
+%define xmmD  xmm3
+%define ymmC  ymm2
+%define ymmD  ymm3
+%elif RGB_BLUE == 1
+%define mmC  mm4
+%define mmD  mm5
+%define xmmC  xmm4
+%define xmmD  xmm5
+%define ymmC  ymm4
+%define ymmD  ymm5
+%else
+%define mmC  mm6
+%define mmD  mm7
+%define xmmC  xmm6
+%define xmmD  xmm7
+%define ymmC  ymm6
+%define ymmD  ymm7
+%endif
+
+%if RGB_RED == 2
+%define mmE  mm0
+%define mmF  mm1
+%define xmmE  xmm0
+%define xmmF  xmm1
+%define ymmE  ymm0
+%define ymmF  ymm1
+%elif RGB_GREEN == 2
+%define mmE  mm2
+%define mmF  mm3
+%define xmmE  xmm2
+%define xmmF  xmm3
+%define ymmE  ymm2
+%define ymmF  ymm3
+%elif RGB_BLUE == 2
+%define mmE  mm4
+%define mmF  mm5
+%define xmmE  xmm4
+%define xmmF  xmm5
+%define ymmE  ymm4
+%define ymmF  ymm5
+%else
+%define mmE  mm6
+%define mmF  mm7
+%define xmmE  xmm6
+%define xmmF  xmm7
+%define ymmE  ymm6
+%define ymmF  ymm7
+%endif
+
+%if RGB_RED == 3
+%define mmG  mm0
+%define mmH  mm1
+%define xmmG  xmm0
+%define xmmH  xmm1
+%define ymmG  ymm0
+%define ymmH  ymm1
+%elif RGB_GREEN == 3
+%define mmG  mm2
+%define mmH  mm3
+%define xmmG  xmm2
+%define xmmH  xmm3
+%define ymmG  ymm2
+%define ymmH  ymm3
+%elif RGB_BLUE == 3
+%define mmG  mm4
+%define mmH  mm5
+%define xmmG  xmm4
+%define xmmH  xmm5
+%define ymmG  ymm4
+%define ymmH  ymm5
+%else
+%define mmG  mm6
+%define mmH  mm7
+%define xmmG  xmm6
+%define xmmH  xmm7
+%define ymmG  ymm6
+%define ymmH  ymm7
+%endif
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jdct.inc b/media/libjpeg/simd/nasm/jdct.inc
similarity index 63%
rename from media/libjpeg/simd/jdct.inc
rename to media/libjpeg/simd/nasm/jdct.inc
index b9761071e9..9192f66f0c 100644
--- a/media/libjpeg/simd/jdct.inc
+++ b/media/libjpeg/simd/nasm/jdct.inc
@@ -2,12 +2,11 @@
 ; jdct.inc - private declarations for forward & reverse DCT subsystems
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2018, D. R. Commander.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
-;
-; [TAB8]
 
 ; Each IDCT routine is responsible for range-limiting its results and
 ; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
@@ -17,11 +16,16 @@
 ;
 %define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
 
-%define ROW(n,b,s)              ((b)+(n)*(s))
-%define COL(n,b,s)              ((b)+(n)*(s)*DCTSIZE)
+%define ROW(n, b, s)  ((b) + (n) * (s))
+%define COL(n, b, s)  ((b) + (n) * (s) * DCTSIZE)
 
-%define DWBLOCK(m,n,b,s)        ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
-%define MMBLOCK(m,n,b,s)        ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
-%define XMMBLOCK(m,n,b,s)       ((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
+%define DWBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_DWORD)
+%define MMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_MMWORD)
+%define XMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_XMMWORD)
+%define YMMBLOCK(m, n, b, s) \
+  ((b) + (m) * DCTSIZE * (s) + (n) * SIZEOF_YMMWORD)
 
 ; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jsimdcfg.inc b/media/libjpeg/simd/nasm/jsimdcfg.inc
similarity index 97%
rename from media/libjpeg/simd/jsimdcfg.inc
rename to media/libjpeg/simd/nasm/jsimdcfg.inc
index 9d4aedec9e..667024a5f9 100644
--- a/media/libjpeg/simd/jsimdcfg.inc
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc
@@ -90,5 +90,4 @@
 %define JSIMD_3DNOW 0x02
 %define JSIMD_SSE 0x04
 %define JSIMD_SSE2 0x08
-; Short forms of external names for systems with brain-damaged linkers.
-;
+%define JSIMD_AVX2 0x80
diff --git a/media/libjpeg/simd/nasm/jsimdcfg.inc.h b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
new file mode 100644
index 0000000000..bf2a45ad50
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdcfg.inc.h
@@ -0,0 +1,133 @@
+/*
+ * This file generates the include file for the assembly
+ * implementations by abusing the C preprocessor.
+ *
+ * Note: Some things are manually defined as they need to
+ * be mapped to NASM types.
+ */
+
+;
+; Automatically generated include file from jsimdcfg.inc.h
+;
+
+#define JPEG_INTERNALS
+
+#include "../jpeglib.h"
+#include "../jconfig.h"
+#include "../jmorecfg.h"
+#include "jsimd.h"
+
+;
+; -- jpeglib.h
+;
+
+%define _cpp_protection_DCTSIZE   DCTSIZE
+%define _cpp_protection_DCTSIZE2  DCTSIZE2
+
+;
+; -- jmorecfg.h
+;
+
+%define _cpp_protection_RGB_RED             RGB_RED
+%define _cpp_protection_RGB_GREEN           RGB_GREEN
+%define _cpp_protection_RGB_BLUE            RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE       RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGB_RED         EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN       EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE        EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE   EXT_RGB_PIXELSIZE
+
+%define _cpp_protection_EXT_RGBX_RED        EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN      EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE       EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE  EXT_RGBX_PIXELSIZE
+
+%define _cpp_protection_EXT_BGR_RED         EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN       EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE        EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE   EXT_BGR_PIXELSIZE
+
+%define _cpp_protection_EXT_BGRX_RED        EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN      EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE       EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE  EXT_BGRX_PIXELSIZE
+
+%define _cpp_protection_EXT_XBGR_RED        EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN      EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE       EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE  EXT_XBGR_PIXELSIZE
+
+%define _cpp_protection_EXT_XRGB_RED        EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN      EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE       EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+
+%define RGBX_FILLER_0XFF  1
+
+; Representation of a single sample (pixel element value).
+; On this SIMD implementation, this must be 'unsigned char'.
+;
+
+%define JSAMPLE            byte            ; unsigned char
+%define SIZEOF_JSAMPLE     SIZEOF_BYTE     ; sizeof(JSAMPLE)
+
+%define _cpp_protection_CENTERJSAMPLE  CENTERJSAMPLE
+
+; Representation of a DCT frequency coefficient.
+; On this SIMD implementation, this must be 'short'.
+;
+%define JCOEF              word            ; short
+%define SIZEOF_JCOEF       SIZEOF_WORD     ; sizeof(JCOEF)
+
+; Datatype used for image dimensions.
+; On this SIMD implementation, this must be 'unsigned int'.
+;
+%define JDIMENSION         dword           ; unsigned int
+%define SIZEOF_JDIMENSION  SIZEOF_DWORD    ; sizeof(JDIMENSION)
+
+%define JSAMPROW           POINTER         ; JSAMPLE *     (jpeglib.h)
+%define JSAMPARRAY         POINTER         ; JSAMPROW *    (jpeglib.h)
+%define JSAMPIMAGE         POINTER         ; JSAMPARRAY *  (jpeglib.h)
+%define JCOEFPTR           POINTER         ; JCOEF *       (jpeglib.h)
+%define SIZEOF_JSAMPROW    SIZEOF_POINTER  ; sizeof(JSAMPROW)
+%define SIZEOF_JSAMPARRAY  SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
+%define SIZEOF_JSAMPIMAGE  SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
+%define SIZEOF_JCOEFPTR    SIZEOF_POINTER  ; sizeof(JCOEFPTR)
+
+;
+; -- jdct.h
+;
+
+; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
+; the DCT is to be performed in-place in that buffer.
+; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
+;
+%define DCTELEM                 word         ; short
+%define SIZEOF_DCTELEM          SIZEOF_WORD  ; sizeof(DCTELEM)
+
+%define FAST_FLOAT              FP32         ; float
+%define SIZEOF_FAST_FLOAT       SIZEOF_FP32  ; sizeof(FAST_FLOAT)
+
+; To maximize parallelism, Type MULTIPLIER is changed to short.
+;
+%define ISLOW_MULT_TYPE         word         ; must be short
+%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD  ; sizeof(ISLOW_MULT_TYPE)
+
+%define IFAST_MULT_TYPE         word         ; must be short
+%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD  ; sizeof(IFAST_MULT_TYPE)
+%define IFAST_SCALE_BITS        2            ; fractional bits in scale factors
+
+%define FLOAT_MULT_TYPE         FP32         ; must be float
+%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32  ; sizeof(FLOAT_MULT_TYPE)
+
+;
+; -- jsimd.h
+;
+
+%define _cpp_protection_JSIMD_NONE   JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX    JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW  JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE    JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2   JSIMD_SSE2
+%define _cpp_protection_JSIMD_AVX2   JSIMD_AVX2
diff --git a/media/libjpeg/simd/nasm/jsimdext.inc b/media/libjpeg/simd/nasm/jsimdext.inc
new file mode 100644
index 0000000000..e8d50b0349
--- /dev/null
+++ b/media/libjpeg/simd/nasm/jsimdext.inc
@@ -0,0 +1,520 @@
+;
+; jsimdext.inc - common declarations
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
+; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
+;
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+;
+; This software is provided 'as-is', without any express or implied
+; warranty.  In no event will the authors be held liable for any damages
+; arising from the use of this software.
+;
+; Permission is granted to anyone to use this software for any purpose,
+; including commercial applications, and to alter it and redistribute it
+; freely, subject to the following restrictions:
+;
+; 1. The origin of this software must not be misrepresented; you must not
+;    claim that you wrote the original software. If you use this software
+;    in a product, an acknowledgment in the product documentation would be
+;    appreciated but is not required.
+; 2. Altered source versions must be plainly marked as such, and must not be
+;    misrepresented as being the original software.
+; 3. This notice may not be removed or altered from any source distribution.
+
+; ==========================================================================
+;  System-dependent configurations
+
+%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
+; * Microsoft Visual C++
+; * MinGW (Minimalist GNU for Windows)
+; * CygWin
+; * LCC-Win32
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT   .text  align=32
+%define SEG_CONST  .rdata align=32
+%else
+%define SEG_TEXT   .text  align=32 public use32 class=CODE
+%define SEG_CONST  .rdata align=32 public use32 class=CONST
+%endif
+
+%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
+; * Microsoft Visual C++
+
+; -- segment definition --
+;
+%ifdef __YASM_VER__
+%define SEG_TEXT    .text  align=32
+%define SEG_CONST   .rdata align=32
+%else
+%define SEG_TEXT    .text  align=32 public use64 class=CODE
+%define SEG_CONST   .rdata align=32 public use64 class=CONST
+%endif
+%define EXTN(name)  name                ; foo() -> foo
+
+%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
+; * Borland C++ (Win32)
+
+; -- segment definition --
+;
+%define SEG_TEXT   _text align=32 public use32 class=CODE
+%define SEG_CONST  _data align=32 public use32 class=DATA
+
+%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
+; * Linux
+; * *BSD family Unix using elf format
+; * Unix System V, including Solaris x86, UnixWare and SCO Unix
+
+; mark stack as non-executable
+section .note.GNU-stack noalloc noexec nowrite progbits
+
+; -- segment definition --
+;
+%ifdef __x86_64__
+%define SEG_TEXT   .text   progbits align=32
+%define SEG_CONST  .rodata progbits align=32
+%else
+%define SEG_TEXT   .text   progbits alloc exec   nowrite align=32
+%define SEG_CONST  .rodata progbits alloc noexec nowrite align=32
+%endif
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_  ; ELF supports PIC
+%define EXTN(name)  name                   ; foo() -> foo
+
+%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
+; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
+; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+; To make the code position-independent, append -DPIC to the commandline
+;
+%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC
+
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text  ;align=32     ; nasm doesn't accept align=32. why?
+%define SEG_CONST  .rodata align=32
+
+; The generation of position-independent code (PIC) is the default on Darwin.
+;
+%define PIC
+%define GOT_SYMBOL  _MACHO_PIC_         ; Mach-O style code-relative addressing
+
+%else           ; ----(Other case)----------------------
+
+; -- segment definition --
+;
+%define SEG_TEXT   .text
+%define SEG_CONST  .data
+
+%endif          ; ----------------------------------------------
+
+; ==========================================================================
+
+; --------------------------------------------------------------------------
+;  Common types
+;
+%ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
+%define POINTER         qword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
+%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resq
+%define dp              dq
+%define raxp            rax
+%define rbxp            rbx
+%define rcxp            rcx
+%define rdxp            rdx
+%define rsip            rsi
+%define rdip            rdi
+%define rbpp            rbp
+%define rspp            rsp
+%define r8p             r8
+%define r9p             r9
+%define r10p            r10
+%define r11p            r11
+%define r12p            r12
+%define r13p            r13
+%define r14p            r14
+%define r15p            r15
+%endif
+%endif
+%ifndef raxp
+%define POINTER         dword           ; general pointer type
+%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
+%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resd
+%define dp              dd
+; x86_64 ILP32 ABI (x32)
+%define raxp            eax
+%define rbxp            ebx
+%define rcxp            ecx
+%define rdxp            edx
+%define rsip            esi
+%define rdip            edi
+%define rbpp            ebp
+%define rspp            esp
+%define r8p             r8d
+%define r9p             r9d
+%define r10p            r10d
+%define r11p            r11d
+%define r12p            r12d
+%define r13p            r13d
+%define r14p            r14d
+%define r15p            r15d
+%endif
+
+%define INT             dword           ; signed integer type
+%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
+%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT
+
+%define FP32            dword           ; IEEE754 single
+%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
+%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
+
+%define MMWORD          qword           ; int64  (MMX register)
+%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
+%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
+
+; NASM is buggy and doesn't properly handle operand sizes for SSE
+; instructions, so for now we have to define XMMWORD as blank.
+%define XMMWORD                         ; int128 (SSE register)
+%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
+%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
+
+%define YMMWORD                         ; int256 (AVX register)
+%define SIZEOF_YMMWORD  SIZEOF_YWORD    ; sizeof(YMMWORD)
+%define YMMWORD_BIT     YWORD_BIT       ; sizeof(YMMWORD)*BYTE_BIT
+
+; Similar hacks for when we load a dword or MMWORD into an xmm# register
+%define XMM_DWORD
+%define XMM_MMWORD
+
+%define SIZEOF_BYTE   1                 ; sizeof(byte)
+%define SIZEOF_WORD   2                 ; sizeof(word)
+%define SIZEOF_DWORD  4                 ; sizeof(dword)
+%define SIZEOF_QWORD  8                 ; sizeof(qword)
+%define SIZEOF_OWORD  16                ; sizeof(oword)
+%define SIZEOF_YWORD  32                ; sizeof(yword)
+
+%define BYTE_BIT      8                 ; CHAR_BIT in C
+%define WORD_BIT      16                ; sizeof(word)*BYTE_BIT
+%define DWORD_BIT     32                ; sizeof(dword)*BYTE_BIT
+%define QWORD_BIT     64                ; sizeof(qword)*BYTE_BIT
+%define OWORD_BIT     128               ; sizeof(oword)*BYTE_BIT
+%define YWORD_BIT     256               ; sizeof(yword)*BYTE_BIT
+
+; --------------------------------------------------------------------------
+;  External Symbol Name
+;
+%ifndef EXTN
+%define EXTN(name)  _ %+ name           ; foo() -> _foo
+%endif
+
+; --------------------------------------------------------------------------
+;  Hidden symbols
+;
+%ifdef ELF      ; ----(nasm -felf[64] -DELF ...)--------
+%define GLOBAL_FUNCTION(name)  global EXTN(name):function hidden
+%define GLOBAL_DATA(name)      global EXTN(name):data hidden
+%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
+%ifdef __YASM_VER__
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%else
+%if __NASM_VERSION_ID__ >= 0x020E0000
+%define GLOBAL_FUNCTION(name)  global EXTN(name):private_extern
+%define GLOBAL_DATA(name)      global EXTN(name):private_extern
+%endif
+%endif
+%endif
+
+%ifndef GLOBAL_FUNCTION
+%define GLOBAL_FUNCTION(name)  global EXTN(name)
+%endif
+%ifndef GLOBAL_DATA
+%define GLOBAL_DATA(name)      global EXTN(name)
+%endif
+
+; --------------------------------------------------------------------------
+;  Macros for position-independent code (PIC) support
+;
+%ifndef GOT_SYMBOL
+%undef PIC
+%endif
+
+%ifdef PIC  ; -------------------------------------------
+
+%ifidn GOT_SYMBOL, _MACHO_PIC_  ; --------------------
+
+; At present, nasm doesn't seem to support PIC generation for Mach-O.
+; The PIC support code below is a little tricky.
+
+    SECTION     SEG_CONST
+const_base:
+
+%define GOTOFF(got, sym)  (got) + (sym) - const_base
+
+%imacro get_GOT 1
+    ; NOTE: this macro destroys ecx resister.
+    call        %%geteip
+    add         ecx, byte (%%ref - $)
+    jmp         short %%adjust
+%%geteip:
+    mov         ecx, POINTER [esp]
+    ret
+%%adjust:
+    push        ebp
+    xor         ebp, ebp                ; ebp = 0
+%ifidni %1, ebx  ; (%1 == ebx)
+    ; db 0x8D,0x9C + jmp near const_base =
+    ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
+    db          0x8D, 0x9C              ; 8D,9C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+%else  ; (%1 != ebx)
+    ; db 0x8D,0x8C + jmp near const_base =
+    ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
+    db          0x8D, 0x8C              ; 8D,8C
+    jmp         near const_base         ; E9,(const_base-%%ref)
+%%ref:
+    mov         %1, ecx
+%endif  ; (%1 == ebx)
+    pop         ebp
+%endmacro
+
+%else     ; GOT_SYMBOL != _MACHO_PIC_ ----------------
+
+%define GOTOFF(got, sym)  (got) + (sym) wrt ..gotoff
+
+%imacro get_GOT 1
+    extern      GOT_SYMBOL
+    call        %%geteip
+    add         %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
+    jmp         short %%done
+%%geteip:
+    mov         %1, POINTER [esp]
+    ret
+%%done:
+%endmacro
+
+%endif    ; GOT_SYMBOL == _MACHO_PIC_ ----------------
+
+%imacro pushpic 1.nolist
+    push        %1
+%endmacro
+%imacro poppic  1.nolist
+    pop         %1
+%endmacro
+%imacro movpic  2.nolist
+    mov         %1, %2
+%endmacro
+
+%else    ; !PIC -----------------------------------------
+
+%define GOTOFF(got, sym)  (sym)
+
+%imacro get_GOT 1.nolist
+%endmacro
+%imacro pushpic 1.nolist
+%endmacro
+%imacro poppic  1.nolist
+%endmacro
+%imacro movpic  2.nolist
+%endmacro
+
+%endif   ;  PIC -----------------------------------------
+
+; --------------------------------------------------------------------------
+;  Align the next instruction on {2,4,8,16,..}-byte boundary.
+;  ".balign n,,m" in GNU as
+;
+%define MSKLE(x, y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
+%define FILLB(b, n)  (($$-(b)) & ((n)-1))
+
+%imacro alignx 1-2.nolist 0xFFFF
+%%bs: \
+  times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
+        db 0x90                                      ; nop
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
+        db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00  ; lea ebx,[ebx+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
+        db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00  ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
+        db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00        ; lea ebp,[ebp+0x00000000]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
+        db 0x8D, 0x6C, 0x25, 0x00                    ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
+        db 0x8D, 0x6D, 0x00                          ; lea ebp,[ebp+0x00]
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
+        db 0x8B, 0xED                                ; mov ebp,ebp
+  times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
+        db 0x90                                      ; nop
+%endmacro
+
+; Align the next data on {2,4,8,16,..}-byte boundary.
+;
+%imacro alignz 1.nolist
+    align       %1, db 0                ; filling zeros
+%endmacro
+
+%ifdef __x86_64__
+
+%ifdef WIN64
+
+%imacro collect_args 1
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm6
+    sub         rsp, SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp], xmm7
+    mov         r10, rcx
+%if %1 > 1
+    mov         r11, rdx
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, r8
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, r9
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, [rax+48]
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, [rax+56]
+%endif
+    push        rsi
+    push        rdi
+%endmacro
+
+%imacro uncollect_args 1
+    pop         rdi
+    pop         rsi
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+    movaps      xmm7, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+    movaps      xmm6, XMMWORD [rsp]
+    add         rsp, SIZEOF_XMMWORD
+%endmacro
+
+%imacro push_xmm 1
+    sub         rsp, %1 * SIZEOF_XMMWORD
+    movaps      XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
+%if %1 > 1
+    movaps      XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
+%endif
+%if %1 > 2
+    movaps      XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
+%endif
+%if %1 > 3
+    movaps      XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
+%endif
+%endmacro
+
+%imacro pop_xmm 1
+    movaps      xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
+%if %1 > 1
+    movaps      xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
+%endif
+%if %1 > 2
+    movaps      xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
+%endif
+%if %1 > 3
+    movaps      xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
+%endif
+    add         rsp, %1 * SIZEOF_XMMWORD
+%endmacro
+
+%else
+
+%imacro collect_args 1
+    push        r10
+    mov         r10, rdi
+%if %1 > 1
+    push        r11
+    mov         r11, rsi
+%endif
+%if %1 > 2
+    push        r12
+    mov         r12, rdx
+%endif
+%if %1 > 3
+    push        r13
+    mov         r13, rcx
+%endif
+%if %1 > 4
+    push        r14
+    mov         r14, r8
+%endif
+%if %1 > 5
+    push        r15
+    mov         r15, r9
+%endif
+%endmacro
+
+%imacro uncollect_args 1
+%if %1 > 5
+    pop         r15
+%endif
+%if %1 > 4
+    pop         r14
+%endif
+%if %1 > 3
+    pop         r13
+%endif
+%if %1 > 2
+    pop         r12
+%endif
+%if %1 > 1
+    pop         r11
+%endif
+    pop         r10
+%endmacro
+
+%imacro push_xmm 1
+%endmacro
+
+%imacro pop_xmm 1
+%endmacro
+
+%endif
+
+%endif
+
+; --------------------------------------------------------------------------
+;  Defines picked up from the C headers
+;
+%include "jsimdcfg.inc"
+
+; --------------------------------------------------------------------------
diff --git a/media/libjpeg/simd/jccolext-altivec.c b/media/libjpeg/simd/powerpc/jccolext-altivec.c
similarity index 95%
rename from media/libjpeg/simd/jccolext-altivec.c
rename to media/libjpeg/simd/powerpc/jccolext-altivec.c
index 849825eb06..170f90ff80 100644
--- a/media/libjpeg/simd/jccolext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jccolext-altivec.c
@@ -24,9 +24,9 @@
 /* This file is included by jccolor-altivec.c */
 
 
-void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
-                                    JSAMPIMAGE output_buf,
-                                    JDIMENSION output_row, int num_rows)
+void jsimd_rgb_ycc_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                   JSAMPIMAGE output_buf,
+                                   JDIMENSION output_row, int num_rows)
 {
   JSAMPROW inptr, outptr0, outptr1, outptr2;
   int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -35,13 +35,13 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
 #endif
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
 
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
     rgbg0, rgbg1, rgbg2, rgbg3, y, cb, cr;
 #if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
-  __vector unsigned char rgb3 = {0};
+  __vector unsigned char rgb3 = { 0 };
 #endif
 #if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
-  __vector unsigned char rgb4 = {0};
+  __vector unsigned char rgb4 = { 0 };
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
   __vector unsigned short yl, yh, crl, crh, cbl, cbh;
@@ -57,9 +57,11 @@ void jsimd_rgb_ycc_convert_altivec (JDIMENSION img_width, JSAMPARRAY input_buf,
     pd_onehalfm1_cj = { __4X(ONE_HALF - 1 + (CENTERJSAMPLE << SCALEBITS)) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
 #endif
 
   while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/powerpc/jccolor-altivec.c b/media/libjpeg/simd/powerpc/jccolor-altivec.c
new file mode 100644
index 0000000000..d670dbcda3
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jccolor-altivec.c
@@ -0,0 +1,116 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> YCC CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_081  5329                 /* FIX(0.08131) */
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_168  11059                /* FIX(0.16874) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_331  21709                /* FIX(0.33126) */
+#define F_0_418  27439                /* FIX(0.41869) */
+#define F_0_500  32768                /* FIX(0.50000) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extrgbx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extbgrx_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extxbgr_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
+#define jsimd_rgb_ycc_convert_altivec  jsimd_extxrgb_ycc_convert_altivec
+#include "jccolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_ycc_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jcgray-altivec.c b/media/libjpeg/simd/powerpc/jcgray-altivec.c
new file mode 100644
index 0000000000..a11a7e7021
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcgray-altivec.c
@@ -0,0 +1,111 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_114  7471                 /* FIX(0.11400) */
+#define F_0_250  16384                /* FIX(0.25000) */
+#define F_0_299  19595                /* FIX(0.29900) */
+#define F_0_587  38470                /* FIX(0.58700) */
+#define F_0_337  (F_0_587 - F_0_250)  /* FIX(0.58700) - FIX(0.25000) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+
+#define RGBG_INDEX0 \
+  {  0,  1,  3,  4,  6,  7,  9, 10,  2,  1,  5,  4,  8,  7, 11, 10 }
+#define RGBG_INDEX1 \
+  { 12, 13, 15, 16, 18, 19, 21, 22, 14, 13, 17, 16, 20, 19, 23, 22 }
+#define RGBG_INDEX2 \
+  {  8,  9, 11, 12, 14, 15, 17, 18, 10,  9, 13, 12, 16, 15, 19, 18 }
+#define RGBG_INDEX3 \
+  {  4,  5,  7,  8, 10, 11, 13, 14,  6,  5,  9,  8, 12, 11, 15, 14 }
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGBG_INDEX \
+  {  0,  1,  4,  5,  8,  9, 12, 13,  2,  1,  6,  5, 10,  9, 14, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extrgbx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGBG_INDEX0 \
+  {  2,  1,  5,  4,  8,  7, 11, 10,  0,  1,  3,  4,  6,  7,  9, 10 }
+#define RGBG_INDEX1 \
+  { 14, 13, 17, 16, 20, 19, 23, 22, 12, 13, 15, 16, 18, 19, 21, 22 }
+#define RGBG_INDEX2 \
+  { 10,  9, 13, 12, 16, 15, 19, 18,  8,  9, 11, 12, 14, 15, 17, 18 }
+#define RGBG_INDEX3 \
+  {  6,  5,  9,  8, 12, 11, 15, 14,  4,  5,  7,  8, 10, 11, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX0
+#undef RGBG_INDEX1
+#undef RGBG_INDEX2
+#undef RGBG_INDEX3
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGBG_INDEX \
+  {  2,  1,  6,  5, 10,  9, 14, 13,  0,  1,  4,  5,  8,  9, 12, 13 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extbgrx_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGBG_INDEX \
+  {  3,  2,  7,  6, 11, 10, 15, 14,  1,  2,  5,  6,  9, 10, 13, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxbgr_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGBG_INDEX \
+  {  1,  2,  5,  6,  9, 10, 13, 14,  3,  2,  7,  6, 11, 10, 15, 14 }
+#define jsimd_rgb_gray_convert_altivec  jsimd_extxrgb_gray_convert_altivec
+#include "jcgryext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGBG_INDEX
+#undef jsimd_rgb_gray_convert_altivec
diff --git a/media/libjpeg/simd/jcgryext-altivec.c b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
similarity index 93%
rename from media/libjpeg/simd/jcgryext-altivec.c
rename to media/libjpeg/simd/powerpc/jcgryext-altivec.c
index 7f8232bb24..b280cbbded 100644
--- a/media/libjpeg/simd/jcgryext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jcgryext-altivec.c
@@ -24,10 +24,9 @@
 /* This file is included by jcgray-altivec.c */
 
 
-void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
-                                     JSAMPARRAY input_buf,
-                                     JSAMPIMAGE output_buf,
-                                     JDIMENSION output_row, int num_rows)
+void jsimd_rgb_gray_convert_altivec(JDIMENSION img_width, JSAMPARRAY input_buf,
+                                    JSAMPIMAGE output_buf,
+                                    JDIMENSION output_row, int num_rows)
 {
   JSAMPROW inptr, outptr;
   int pitch = img_width * RGB_PIXELSIZE, num_cols;
@@ -36,13 +35,13 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
   unsigned char __attribute__((aligned(16))) tmpbuf[RGB_PIXELSIZE * 16];
 #endif
 
-  __vector unsigned char rgb0, rgb1 = {0}, rgb2 = {0},
+  __vector unsigned char rgb0, rgb1 = { 0 }, rgb2 = { 0 },
     rgbg0, rgbg1, rgbg2, rgbg3, y;
 #if __BIG_ENDIAN__ || RGB_PIXELSIZE == 4
-  __vector unsigned char rgb3 = {0};
+  __vector unsigned char rgb3 = { 0 };
 #endif
 #if __BIG_ENDIAN__ && RGB_PIXELSIZE == 4
-  __vector unsigned char rgb4 = {0};
+  __vector unsigned char rgb4 = { 0 };
 #endif
   __vector short rg0, rg1, rg2, rg3, bg0, bg1, bg2, bg3;
   __vector unsigned short yl, yh;
@@ -54,9 +53,11 @@ void jsimd_rgb_gray_convert_altivec (JDIMENSION img_width,
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+    shift_pack_index =
+      { 0, 1, 4, 5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    shift_pack_index =
+      { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
 #endif
 
   while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/jcsample-altivec.c b/media/libjpeg/simd/powerpc/jcsample-altivec.c
similarity index 84%
rename from media/libjpeg/simd/jcsample-altivec.c
rename to media/libjpeg/simd/powerpc/jcsample-altivec.c
index 11609d9dab..6e25b8db90 100644
--- a/media/libjpeg/simd/jcsample-altivec.c
+++ b/media/libjpeg/simd/powerpc/jcsample-altivec.c
@@ -26,14 +26,15 @@
 #include "jcsample.h"
 
 
-void
-jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
-                               JDIMENSION v_samp_factor,
-                               JDIMENSION width_blocks,
-                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+void jsimd_h2v1_downsample_altivec(JDIMENSION image_width,
+                                   int max_v_samp_factor,
+                                   JDIMENSION v_samp_factor,
+                                   JDIMENSION width_in_blocks,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY output_data)
 {
   int outrow, outcol;
-  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
   JSAMPROW inptr, outptr;
 
   __vector unsigned char this0, next0, out;
@@ -43,7 +44,7 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
   __vector unsigned short pw_bias = { __4X2(0, 1) },
     pw_one = { __8X(1) };
   __vector unsigned char even_odd_index =
-    {0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15},
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
     pb_zero = { __16X(0) };
 
   expand_right_edge(input_data, max_v_samp_factor, image_width,
@@ -83,13 +84,13 @@ jsimd_h2v1_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
 
 
 void
-jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
-                               JDIMENSION v_samp_factor,
-                               JDIMENSION width_blocks,
-                               JSAMPARRAY input_data, JSAMPARRAY output_data)
+jsimd_h2v2_downsample_altivec(JDIMENSION image_width, int max_v_samp_factor,
+                              JDIMENSION v_samp_factor,
+                              JDIMENSION width_in_blocks,
+                              JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
   int inrow, outrow, outcol;
-  JDIMENSION output_cols = width_blocks * DCTSIZE;
+  JDIMENSION output_cols = width_in_blocks * DCTSIZE;
   JSAMPROW inptr0, inptr1, outptr;
 
   __vector unsigned char this0, next0, this1, next1, out;
@@ -100,7 +101,7 @@ jsimd_h2v2_downsample_altivec (JDIMENSION image_width, int max_v_samp_factor,
   __vector unsigned short pw_bias = { __4X2(1, 2) },
     pw_two = { __8X(2) };
   __vector unsigned char even_odd_index =
-    { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+    {  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15 },
     pb_zero = { __16X(0) };
 
   expand_right_edge(input_data, max_v_samp_factor, image_width,
diff --git a/media/libjpeg/simd/powerpc/jcsample.h b/media/libjpeg/simd/powerpc/jcsample.h
new file mode 100644
index 0000000000..bd07fcc4ed
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jcsample.h
@@ -0,0 +1,28 @@
+/*
+ * jcsample.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1996, Thomas G. Lane.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+LOCAL(void)
+expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
+                  JDIMENSION output_cols)
+{
+  register JSAMPROW ptr;
+  register JSAMPLE pixval;
+  register int count;
+  int row;
+  int numcols = (int)(output_cols - input_cols);
+
+  if (numcols > 0) {
+    for (row = 0; row < num_rows; row++) {
+      ptr = image_data[row] + input_cols;
+      pixval = ptr[-1];
+      for (count = numcols; count > 0; count--)
+        *ptr++ = pixval;
+    }
+  }
+}
diff --git a/media/libjpeg/simd/jdcolext-altivec.c b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
similarity index 96%
rename from media/libjpeg/simd/jdcolext-altivec.c
rename to media/libjpeg/simd/powerpc/jdcolext-altivec.c
index fb121ce745..68d52bd8a2 100644
--- a/media/libjpeg/simd/jdcolext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdcolext-altivec.c
@@ -23,9 +23,9 @@
 /* This file is included by jdcolor-altivec.c */
 
 
-void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
-                                    JDIMENSION input_row,
-                                    JSAMPARRAY output_buf, int num_rows)
+void jsimd_ycc_rgb_convert_altivec(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                                   JDIMENSION input_row, JSAMPARRAY output_buf,
+                                   int num_rows)
 {
   JSAMPROW outptr, inptr0, inptr1, inptr2;
   int pitch = out_width * RGB_PIXELSIZE, num_cols;
@@ -61,9 +61,11 @@ void jsimd_ycc_rgb_convert_altivec (JDIMENSION out_width, JSAMPIMAGE input_buf,
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29};
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31};
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
 #endif
 
   while (--num_rows >= 0) {
diff --git a/media/libjpeg/simd/powerpc/jdcolor-altivec.c b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
new file mode 100644
index 0000000000..eb35b67176
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdcolor-altivec.c
@@ -0,0 +1,106 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extrgbx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extbgrx_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxbgr_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
+#define jsimd_ycc_rgb_convert_altivec  jsimd_ycc_extxrgb_convert_altivec
+#include "jdcolext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_ycc_rgb_convert_altivec
diff --git a/media/libjpeg/simd/powerpc/jdmerge-altivec.c b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
new file mode 100644
index 0000000000..79c577f141
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jdmerge-altivec.c
@@ -0,0 +1,130 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* MERGED YCC --> RGB CONVERSION AND UPSAMPLING */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_344  22554              /* FIX(0.34414) */
+#define F_0_714  46802              /* FIX(0.71414) */
+#define F_1_402  91881              /* FIX(1.40200) */
+#define F_1_772  116130             /* FIX(1.77200) */
+#define F_0_402  (F_1_402 - 65536)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  (65536 - F_0_714)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  (131072 - F_1_772) /* FIX(2) - FIX(1.77200) */
+
+#define SCALEBITS  16
+#define ONE_HALF  (1 << (SCALEBITS - 1))
+
+#define RGB_INDEX0 \
+  {  0,  1,  8,  2,  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18 }
+#define RGB_INDEX1 \
+  {  3, 10,  4,  5, 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21 }
+#define RGB_INDEX2 \
+  { 12,  6,  7, 14, 16, 17, 24, 18, 19, 26, 20, 21, 28, 22, 23, 30 }
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define RGB_INDEX \
+  {  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extrgbx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extrgbx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define RGB_INDEX0 \
+  {  8,  1,  0, 10,  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26 }
+#define RGB_INDEX1 \
+  {  3,  2, 12,  5,  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21 }
+#define RGB_INDEX2 \
+  {  4, 14,  7,  6, 24, 17, 16, 26, 19, 18, 28, 21, 20, 30, 23, 22 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX0
+#undef RGB_INDEX1
+#undef RGB_INDEX2
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define RGB_INDEX \
+  {  8,  1,  0,  9, 10,  3,  2, 11, 12,  5,  4, 13, 14,  7,  6, 15 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extbgrx_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extbgrx_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  8,  1,  0, 11, 10,  3,  2, 13, 12,  5,  4, 15, 14,  7,  6 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extxbgr_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extxbgr_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
+
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define RGB_INDEX \
+  {  9,  0,  1,  8, 11,  2,  3, 10, 13,  4,  5, 12, 15,  6,  7, 14 }
+#define jsimd_h2v1_merged_upsample_altivec \
+  jsimd_h2v1_extxrgb_merged_upsample_altivec
+#define jsimd_h2v2_merged_upsample_altivec \
+  jsimd_h2v2_extxrgb_merged_upsample_altivec
+#include "jdmrgext-altivec.c"
+#undef RGB_PIXELSIZE
+#undef RGB_INDEX
+#undef jsimd_h2v1_merged_upsample_altivec
+#undef jsimd_h2v2_merged_upsample_altivec
diff --git a/media/libjpeg/simd/jdmrgext-altivec.c b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
similarity index 91%
rename from media/libjpeg/simd/jdmrgext-altivec.c
rename to media/libjpeg/simd/powerpc/jdmrgext-altivec.c
index 55205bb1f9..40f02c33ea 100644
--- a/media/libjpeg/simd/jdmrgext-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdmrgext-altivec.c
@@ -23,10 +23,10 @@
 /* This file is included by jdmerge-altivec.c */
 
 
-void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
-                                         JSAMPIMAGE input_buf,
-                                         JDIMENSION in_row_group_ctr,
-                                         JSAMPARRAY output_buf)
+void jsimd_h2v1_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
 {
   JSAMPROW outptr, inptr0, inptr1, inptr2;
   int pitch = output_width * RGB_PIXELSIZE, num_cols, yloop;
@@ -63,13 +63,19 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
   __vector int pd_onehalf = { __4X(ONE_HALF) };
   __vector unsigned char pb_zero = { __16X(0) },
 #if __BIG_ENDIAN__
-    shift_pack_index = {0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29},
-    even_index = {0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30},
-    odd_index = {0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31};
+    shift_pack_index =
+      {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+    even_index =
+      {  0, 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30 },
+    odd_index =
+      {  0, 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31 };
 #else
-    shift_pack_index = {2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31},
-    even_index = {16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0},
-    odd_index = {17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0};
+    shift_pack_index =
+      {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+    even_index =
+      { 16,  0, 18,  0, 20,  0, 22,  0, 24,  0, 26,  0, 28,  0, 30,  0 },
+    odd_index =
+      { 17,  0, 19,  0, 21,  0, 23,  0, 25,  0, 27,  0, 29,  0, 31,  0 };
 #endif
 
   inptr0 = input_buf[0][in_row_group_ctr];
@@ -299,10 +305,10 @@ void jsimd_h2v1_merged_upsample_altivec (JDIMENSION output_width,
 }
 
 
-void jsimd_h2v2_merged_upsample_altivec (JDIMENSION output_width,
-                                         JSAMPIMAGE input_buf,
-                                         JDIMENSION in_row_group_ctr,
-                                         JSAMPARRAY output_buf)
+void jsimd_h2v2_merged_upsample_altivec(JDIMENSION output_width,
+                                        JSAMPIMAGE input_buf,
+                                        JDIMENSION in_row_group_ctr,
+                                        JSAMPARRAY output_buf)
 {
   JSAMPROW inptr, outptr;
 
diff --git a/media/libjpeg/simd/jdsample-altivec.c b/media/libjpeg/simd/powerpc/jdsample-altivec.c
similarity index 83%
rename from media/libjpeg/simd/jdsample-altivec.c
rename to media/libjpeg/simd/powerpc/jdsample-altivec.c
index b40ce55c89..04df0cf108 100644
--- a/media/libjpeg/simd/jdsample-altivec.c
+++ b/media/libjpeg/simd/powerpc/jdsample-altivec.c
@@ -25,31 +25,36 @@
 #include "jsimd_altivec.h"
 
 
-void
-jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
-                                   JDIMENSION downsampled_width,
-                                   JSAMPARRAY input_data,
-                                   JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr, outptr;
   int inrow, incol;
 
-  __vector unsigned char this0, last0, p_last0, next0 = {0}, p_next0,
+  __vector unsigned char this0, last0, p_last0, next0 = { 0 }, p_next0,
     out;
   __vector short this0e, this0o, this0l, this0h, last0l, last0h,
     next0l, next0h, outle, outhe, outlo, outho;
 
   /* Constants */
   __vector unsigned char pb_zero = { __16X(0) }, pb_three = { __16X(3) },
-    last_index_col0 = {0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14},
-    last_index = {15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30},
-    next_index = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16},
-    next_index_lastcol = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,15},
+    last_index_col0 =
+      {  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14 },
+    last_index =
+      { 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 },
+    next_index =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16 },
+    next_index_lastcol =
+      {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15 },
 #if __BIG_ENDIAN__
-    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
 #else
-    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
 #endif
   __vector short pw_one = { __8X(1) }, pw_two = { __8X(2) };
 
@@ -121,11 +126,10 @@ jsimd_h2v1_fancy_upsample_altivec (int max_v_samp_factor,
 }
 
 
-void
-jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
-                                   JDIMENSION downsampled_width,
-                                   JSAMPARRAY input_data,
-                                   JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_fancy_upsample_altivec(int max_v_samp_factor,
+                                       JDIMENSION downsampled_width,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
@@ -136,21 +140,27 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
     lastcolsum_1h, lastcolsum1h,
     p_lastcolsum_1l, p_lastcolsum_1h, p_lastcolsum1l, p_lastcolsum1h,
     thiscolsum_1l, thiscolsum_1h, thiscolsum1l, thiscolsum1h,
-    nextcolsum_1l = {0}, nextcolsum_1h = {0},
-    nextcolsum1l = {0}, nextcolsum1h = {0},
+    nextcolsum_1l = { 0 }, nextcolsum_1h = { 0 },
+    nextcolsum1l = { 0 }, nextcolsum1h = { 0 },
     p_nextcolsum_1l, p_nextcolsum_1h, p_nextcolsum1l, p_nextcolsum1h,
     tmpl, tmph, outle, outhe, outlo, outho;
 
   /* Constants */
   __vector unsigned char pb_zero = { __16X(0) },
-    last_index_col0 = {0,1,0,1,2,3,4,5,6,7,8,9,10,11,12,13},
-    last_index={14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29},
-    next_index = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17},
-    next_index_lastcol = {2,3,4,5,6,7,8,9,10,11,12,13,14,15,14,15},
+    last_index_col0 =
+      {  0,  1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13 },
+    last_index =
+      { 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+    next_index =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17 },
+    next_index_lastcol =
+      {  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15 },
 #if __BIG_ENDIAN__
-    merge_pack_index = {1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31};
+    merge_pack_index =
+      {  1, 17,  3, 19,  5, 21,  7, 23,  9, 25, 11, 27, 13, 29, 15, 31 };
 #else
-    merge_pack_index = {0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30};
+    merge_pack_index =
+      {  0, 16,  2, 18,  4, 20,  6, 22,  8, 24, 10, 26, 12, 28, 14, 30 };
 #endif
   __vector short pw_zero = { __8X(0) }, pw_three = { __8X(3) },
     pw_seven = { __8X(7) }, pw_eight = { __8X(8) };
@@ -306,11 +316,10 @@ jsimd_h2v2_fancy_upsample_altivec (int max_v_samp_factor,
 
 /* These are rarely used (mainly just for decompressing YCCK images) */
 
-void
-jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
-                             JDIMENSION output_width,
-                             JSAMPARRAY input_data,
-                             JSAMPARRAY *output_data_ptr)
+void jsimd_h2v1_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr, outptr;
@@ -345,11 +354,10 @@ jsimd_h2v1_upsample_altivec (int max_v_samp_factor,
 }
 
 
-void
-jsimd_h2v2_upsample_altivec (int max_v_samp_factor,
-                             JDIMENSION output_width,
-                             JSAMPARRAY input_data,
-                             JSAMPARRAY *output_data_ptr)
+void jsimd_h2v2_upsample_altivec(int max_v_samp_factor,
+                                 JDIMENSION output_width,
+                                 JSAMPARRAY input_data,
+                                 JSAMPARRAY *output_data_ptr)
 {
   JSAMPARRAY output_data = *output_data_ptr;
   JSAMPROW inptr, outptr0, outptr1;
diff --git a/media/libjpeg/simd/jfdctfst-altivec.c b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
similarity index 68%
rename from media/libjpeg/simd/jfdctfst-altivec.c
rename to media/libjpeg/simd/powerpc/jfdctfst-altivec.c
index 04157f77ea..ad9af81e0c 100644
--- a/media/libjpeg/simd/jfdctfst-altivec.c
+++ b/media/libjpeg/simd/powerpc/jfdctfst-altivec.c
@@ -32,64 +32,62 @@
 #include "jsimd_altivec.h"
 
 
-#define F_0_382 98   /* FIX(0.382683433) */
-#define F_0_541 139  /* FIX(0.541196100) */
-#define F_0_707 181  /* FIX(0.707106781) */
-#define F_1_306 334  /* FIX(1.306562965) */
+#define F_0_382  98   /* FIX(0.382683433) */
+#define F_0_541  139  /* FIX(0.541196100) */
+#define F_0_707  181  /* FIX(0.707106781) */
+#define F_1_306  334  /* FIX(1.306562965) */
 
-#define CONST_BITS 8
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+#define CONST_BITS  8
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
 
 
-#define DO_FDCT()  \
-{  \
-  /* Even part */  \
+#define DO_FDCT() { \
+  /* Even part */ \
   \
-  tmp10 = vec_add(tmp0, tmp3);  \
-  tmp13 = vec_sub(tmp0, tmp3);  \
-  tmp11 = vec_add(tmp1, tmp2);  \
-  tmp12 = vec_sub(tmp1, tmp2);  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
   \
-  out0  = vec_add(tmp10, tmp11);  \
-  out4  = vec_sub(tmp10, tmp11);  \
+  out0  = vec_add(tmp10, tmp11); \
+  out4  = vec_sub(tmp10, tmp11); \
   \
-  z1 = vec_add(tmp12, tmp13);  \
-  z1 = vec_sl(z1, pre_multiply_scale_bits);  \
-  z1 = vec_madds(z1, pw_0707, pw_zero);  \
+  z1 = vec_add(tmp12, tmp13); \
+  z1 = vec_sl(z1, pre_multiply_scale_bits); \
+  z1 = vec_madds(z1, pw_0707, pw_zero); \
   \
-  out2 = vec_add(tmp13, z1);  \
-  out6 = vec_sub(tmp13, z1);  \
+  out2 = vec_add(tmp13, z1); \
+  out6 = vec_sub(tmp13, z1); \
   \
-  /* Odd part */  \
+  /* Odd part */ \
   \
-  tmp10 = vec_add(tmp4, tmp5);  \
-  tmp11 = vec_add(tmp5, tmp6);  \
-  tmp12 = vec_add(tmp6, tmp7);  \
+  tmp10 = vec_add(tmp4, tmp5); \
+  tmp11 = vec_add(tmp5, tmp6); \
+  tmp12 = vec_add(tmp6, tmp7); \
   \
-  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits);  \
-  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
-  z5 = vec_sub(tmp10, tmp12);  \
-  z5 = vec_madds(z5, pw_0382, pw_zero);  \
+  tmp10 = vec_sl(tmp10, pre_multiply_scale_bits); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  z5 = vec_sub(tmp10, tmp12); \
+  z5 = vec_madds(z5, pw_0382, pw_zero); \
   \
-  z2 = vec_madds(tmp10, pw_0541, z5);  \
-  z4 = vec_madds(tmp12, pw_1306, z5);  \
+  z2 = vec_madds(tmp10, pw_0541, z5); \
+  z4 = vec_madds(tmp12, pw_1306, z5); \
   \
-  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
-  z3 = vec_madds(tmp11, pw_0707, pw_zero);  \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  z3 = vec_madds(tmp11, pw_0707, pw_zero); \
   \
-  z11 = vec_add(tmp7, z3);  \
-  z13 = vec_sub(tmp7, z3);  \
+  z11 = vec_add(tmp7, z3); \
+  z13 = vec_sub(tmp7, z3); \
   \
-  out5 = vec_add(z13, z2);  \
-  out3 = vec_sub(z13, z2);  \
-  out1 = vec_add(z11, z4);  \
-  out7 = vec_sub(z11, z4);  \
+  out5 = vec_add(z13, z2); \
+  out3 = vec_sub(z13, z2); \
+  out1 = vec_add(z11, z4); \
+  out7 = vec_sub(z11, z4); \
 }
 
 
-void
-jsimd_fdct_ifast_altivec (DCTELEM *data)
+void jsimd_fdct_ifast_altivec(DCTELEM *data)
 {
   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
     col0, col1, col2, col3, col4, col5, col6, col7,
diff --git a/media/libjpeg/simd/powerpc/jfdctint-altivec.c b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
new file mode 100644
index 0000000000..3d4f017103
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jfdctint-altivec.c
@@ -0,0 +1,258 @@
+/*
+ * AltiVec optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* ACCURATE INTEGER FORWARD DCT */
+
+#include "jsimd_altivec.h"
+
+
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+
+#define DO_FDCT_COMMON(PASS) { \
+  /* (Original) \
+   * z1 = (tmp12 + tmp13) * 0.541196100; \
+   * data2 = z1 + tmp13 * 0.765366865; \
+   * data6 = z1 + tmp12 * -1.847759065; \
+   * \
+   * (This implementation) \
+   * data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; \
+   * data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); \
+   */ \
+  \
+  tmp1312l = vec_mergeh(tmp13, tmp12); \
+  tmp1312h = vec_mergel(tmp13, tmp12); \
+  \
+  out2l = vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS); \
+  out2h = vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS); \
+  out6l = vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS); \
+  out6h = vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS); \
+  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
+  \
+  out2 = vec_pack(out2l, out2h); \
+  out6 = vec_pack(out6l, out6h); \
+  \
+  /* Odd part */ \
+  \
+  z3 = vec_add(tmp4, tmp6); \
+  z4 = vec_add(tmp5, tmp7); \
+  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
+  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
+  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS); \
+  \
+  /* (Original) \
+   * z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6; \
+   * tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869; \
+   * tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4; \
+   * data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; \
+   * tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; \
+   * tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); \
+   * tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); \
+   * data7 = tmp4 + z3;  data5 = tmp5 + z4; \
+   * data3 = tmp6 + z3;  data1 = tmp7 + z4; \
+   */ \
+  \
+  tmp47l = vec_mergeh(tmp4, tmp7); \
+  tmp47h = vec_mergel(tmp4, tmp7); \
+  \
+  out7l = vec_msums(tmp47l, pw_mf060_mf089, z3l); \
+  out7h = vec_msums(tmp47h, pw_mf060_mf089, z3h); \
+  out1l = vec_msums(tmp47l, pw_mf089_f060, z4l); \
+  out1h = vec_msums(tmp47h, pw_mf089_f060, z4h); \
+  \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  \
+  out7 = vec_pack(out7l, out7h); \
+  out1 = vec_pack(out1l, out1h); \
+  \
+  tmp56l = vec_mergeh(tmp5, tmp6); \
+  tmp56h = vec_mergel(tmp5, tmp6); \
+  \
+  out5l = vec_msums(tmp56l, pw_mf050_mf256, z4l); \
+  out5h = vec_msums(tmp56h, pw_mf050_mf256, z4h); \
+  out3l = vec_msums(tmp56l, pw_mf256_f050, z3l); \
+  out3h = vec_msums(tmp56h, pw_mf256_f050, z3h); \
+  \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  \
+  out5 = vec_pack(out5l, out5h); \
+  out3 = vec_pack(out3l, out3h); \
+}
+
+#define DO_FDCT_PASS1() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_sl(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_sl(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(1); \
+}
+
+#define DO_FDCT_PASS2() { \
+  /* Even part */ \
+  \
+  tmp10 = vec_add(tmp0, tmp3); \
+  tmp13 = vec_sub(tmp0, tmp3); \
+  tmp11 = vec_add(tmp1, tmp2); \
+  tmp12 = vec_sub(tmp1, tmp2); \
+  \
+  out0  = vec_add(tmp10, tmp11); \
+  out0  = vec_add(out0, pw_descale_p2x); \
+  out0  = vec_sra(out0, pass1_bits); \
+  out4  = vec_sub(tmp10, tmp11); \
+  out4  = vec_add(out4, pw_descale_p2x); \
+  out4  = vec_sra(out4, pass1_bits); \
+  \
+  DO_FDCT_COMMON(2); \
+}
+
+
+void jsimd_fdct_islow_altivec(DCTELEM *data)
+{
+  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
+    col0, col1, col2, col3, col4, col5, col6, col7,
+    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
+    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
+    z3, z4, z34l, z34h,
+    out0, out1, out2, out3, out4, out5, out6, out7;
+  __vector int z3l, z3h, z4l, z4h,
+    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
+    out7l, out7h;
+
+  /* Constants */
+  __vector short
+    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
+    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
+    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
+    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
+    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
+    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
+    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
+    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
+    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
+  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
+  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
+    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
+  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
+    descale_p2 = { __4X(DESCALE_P2) };
+
+  /* Pass 1: process rows */
+
+  row0 = vec_ld(0, data);
+  row1 = vec_ld(16, data);
+  row2 = vec_ld(32, data);
+  row3 = vec_ld(48, data);
+  row4 = vec_ld(64, data);
+  row5 = vec_ld(80, data);
+  row6 = vec_ld(96, data);
+  row7 = vec_ld(112, data);
+
+  TRANSPOSE(row, col);
+
+  tmp0 = vec_add(col0, col7);
+  tmp7 = vec_sub(col0, col7);
+  tmp1 = vec_add(col1, col6);
+  tmp6 = vec_sub(col1, col6);
+  tmp2 = vec_add(col2, col5);
+  tmp5 = vec_sub(col2, col5);
+  tmp3 = vec_add(col3, col4);
+  tmp4 = vec_sub(col3, col4);
+
+  DO_FDCT_PASS1();
+
+  /* Pass 2: process columns */
+
+  TRANSPOSE(out, row);
+
+  tmp0 = vec_add(row0, row7);
+  tmp7 = vec_sub(row0, row7);
+  tmp1 = vec_add(row1, row6);
+  tmp6 = vec_sub(row1, row6);
+  tmp2 = vec_add(row2, row5);
+  tmp5 = vec_sub(row2, row5);
+  tmp3 = vec_add(row3, row4);
+  tmp4 = vec_sub(row3, row4);
+
+  DO_FDCT_PASS2();
+
+  vec_st(out0, 0, data);
+  vec_st(out1, 16, data);
+  vec_st(out2, 32, data);
+  vec_st(out3, 48, data);
+  vec_st(out4, 64, data);
+  vec_st(out5, 80, data);
+  vec_st(out6, 96, data);
+  vec_st(out7, 112, data);
+}
diff --git a/media/libjpeg/simd/jidctfst-altivec.c b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
similarity index 71%
rename from media/libjpeg/simd/jidctfst-altivec.c
rename to media/libjpeg/simd/powerpc/jidctfst-altivec.c
index ec30c3995b..456c6c6174 100644
--- a/media/libjpeg/simd/jidctfst-altivec.c
+++ b/media/libjpeg/simd/powerpc/jidctfst-altivec.c
@@ -32,87 +32,85 @@
 #include "jsimd_altivec.h"
 
 
-#define F_1_082 277              /* FIX(1.082392200) */
-#define F_1_414 362              /* FIX(1.414213562) */
-#define F_1_847 473              /* FIX(1.847759065) */
-#define F_2_613 669              /* FIX(2.613125930) */
-#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
+#define F_1_082  277              /* FIX(1.082392200) */
+#define F_1_414  362              /* FIX(1.414213562) */
+#define F_1_847  473              /* FIX(1.847759065) */
+#define F_2_613  669              /* FIX(2.613125930) */
+#define F_1_613  (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
 
-#define CONST_BITS 8
-#define PASS1_BITS 2
-#define PRE_MULTIPLY_SCALE_BITS 2
-#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
+#define CONST_BITS  8
+#define PASS1_BITS  2
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
 
 
-#define DO_IDCT(in)  \
-{  \
-  /* Even part */  \
+#define DO_IDCT(in) { \
+  /* Even part */ \
   \
-  tmp10 = vec_add(in##0, in##4);  \
-  tmp11 = vec_sub(in##0, in##4);  \
-  tmp13 = vec_add(in##2, in##6);  \
+  tmp10 = vec_add(in##0, in##4); \
+  tmp11 = vec_sub(in##0, in##4); \
+  tmp13 = vec_add(in##2, in##6); \
   \
-  tmp12 = vec_sub(in##2, in##6);  \
-  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
-  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero);  \
-  tmp12 = vec_sub(tmp12, tmp13);  \
+  tmp12 = vec_sub(in##2, in##6); \
+  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits); \
+  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero); \
+  tmp12 = vec_sub(tmp12, tmp13); \
   \
-  tmp0 = vec_add(tmp10, tmp13);  \
-  tmp3 = vec_sub(tmp10, tmp13);  \
-  tmp1 = vec_add(tmp11, tmp12);  \
-  tmp2 = vec_sub(tmp11, tmp12);  \
+  tmp0 = vec_add(tmp10, tmp13); \
+  tmp3 = vec_sub(tmp10, tmp13); \
+  tmp1 = vec_add(tmp11, tmp12); \
+  tmp2 = vec_sub(tmp11, tmp12); \
   \
-  /* Odd part */  \
+  /* Odd part */ \
   \
-  z13 = vec_add(in##5, in##3);  \
-  z10 = vec_sub(in##5, in##3);  \
-  z10s = vec_sl(z10, pre_multiply_scale_bits);  \
-  z11 = vec_add(in##1, in##7);  \
-  z12s = vec_sub(in##1, in##7);  \
-  z12s = vec_sl(z12s, pre_multiply_scale_bits);  \
+  z13 = vec_add(in##5, in##3); \
+  z10 = vec_sub(in##5, in##3); \
+  z10s = vec_sl(z10, pre_multiply_scale_bits); \
+  z11 = vec_add(in##1, in##7); \
+  z12s = vec_sub(in##1, in##7); \
+  z12s = vec_sl(z12s, pre_multiply_scale_bits); \
   \
-  tmp11 = vec_sub(z11, z13);  \
-  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
-  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero);  \
+  tmp11 = vec_sub(z11, z13); \
+  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits); \
+  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
   \
-  tmp7 = vec_add(z11, z13);  \
+  tmp7 = vec_add(z11, z13); \
   \
-  /* To avoid overflow...  \
-   *  \
-   * (Original)  \
-   * tmp12 = -2.613125930 * z10 + z5;  \
-   *  \
-   * (This implementation)  \
-   * tmp12 = (-1.613125930 - 1) * z10 + z5;  \
-   *       = -1.613125930 * z10 - z10 + z5;  \
-   */  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation) \
+   * tmp12 = (-1.613125930 - 1) * z10 + z5; \
+   *       = -1.613125930 * z10 - z10 + z5; \
+   */ \
   \
-  z5 = vec_add(z10s, z12s);  \
-  z5 = vec_madds(z5, pw_F1847, pw_zero);  \
+  z5 = vec_add(z10s, z12s); \
+  z5 = vec_madds(z5, pw_F1847, pw_zero); \
   \
-  tmp10 = vec_madds(z12s, pw_F1082, pw_zero);  \
-  tmp10 = vec_sub(tmp10, z5);  \
-  tmp12 = vec_madds(z10s, pw_MF1613, z5);  \
-  tmp12 = vec_sub(tmp12, z10);  \
+  tmp10 = vec_madds(z12s, pw_F1082, pw_zero); \
+  tmp10 = vec_sub(tmp10, z5); \
+  tmp12 = vec_madds(z10s, pw_MF1613, z5); \
+  tmp12 = vec_sub(tmp12, z10); \
   \
-  tmp6 = vec_sub(tmp12, tmp7);  \
-  tmp5 = vec_sub(tmp11, tmp6);  \
-  tmp4 = vec_add(tmp10, tmp5);  \
+  tmp6 = vec_sub(tmp12, tmp7); \
+  tmp5 = vec_sub(tmp11, tmp6); \
+  tmp4 = vec_add(tmp10, tmp5); \
   \
-  out0 = vec_add(tmp0, tmp7);  \
-  out1 = vec_add(tmp1, tmp6);  \
-  out2 = vec_add(tmp2, tmp5);  \
-  out3 = vec_sub(tmp3, tmp4);  \
-  out4 = vec_add(tmp3, tmp4);  \
-  out5 = vec_sub(tmp2, tmp5);  \
-  out6 = vec_sub(tmp1, tmp6);  \
-  out7 = vec_sub(tmp0, tmp7);  \
+  out0 = vec_add(tmp0, tmp7); \
+  out1 = vec_add(tmp1, tmp6); \
+  out2 = vec_add(tmp2, tmp5); \
+  out3 = vec_sub(tmp3, tmp4); \
+  out4 = vec_add(tmp3, tmp4); \
+  out5 = vec_sub(tmp2, tmp5); \
+  out6 = vec_sub(tmp1, tmp6); \
+  out7 = vec_sub(tmp0, tmp7); \
 }
 
 
-void
-jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
-                          JSAMPARRAY output_buf, JDIMENSION output_col)
+void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
 {
   short *dct_table = (short *)dct_table_;
   int *outptr;
diff --git a/media/libjpeg/simd/jidctint-altivec.c b/media/libjpeg/simd/powerpc/jidctint-altivec.c
similarity index 52%
rename from media/libjpeg/simd/jidctint-altivec.c
rename to media/libjpeg/simd/powerpc/jidctint-altivec.c
index 935f35d1eb..60e619f11d 100644
--- a/media/libjpeg/simd/jidctint-altivec.c
+++ b/media/libjpeg/simd/powerpc/jidctint-altivec.c
@@ -1,7 +1,7 @@
 /*
  * AltiVec optimizations for libjpeg-turbo
  *
- * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2014-2015, 2020, D. R. Commander.  All Rights Reserved.
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -20,194 +20,192 @@
  * 3. This notice may not be removed or altered from any source distribution.
  */
 
-/* SLOW INTEGER INVERSE DCT */
+/* ACCURATE INTEGER INVERSE DCT */
 
 #include "jsimd_altivec.h"
 
 
-#define F_0_298 2446   /* FIX(0.298631336) */
-#define F_0_390 3196   /* FIX(0.390180644) */
-#define F_0_541 4433   /* FIX(0.541196100) */
-#define F_0_765 6270   /* FIX(0.765366865) */
-#define F_0_899 7373   /* FIX(0.899976223) */
-#define F_1_175 9633   /* FIX(1.175875602) */
-#define F_1_501 12299  /* FIX(1.501321110) */
-#define F_1_847 15137  /* FIX(1.847759065) */
-#define F_1_961 16069  /* FIX(1.961570560) */
-#define F_2_053 16819  /* FIX(2.053119869) */
-#define F_2_562 20995  /* FIX(2.562915447) */
-#define F_3_072 25172  /* FIX(3.072711026) */
+#define F_0_298  2446   /* FIX(0.298631336) */
+#define F_0_390  3196   /* FIX(0.390180644) */
+#define F_0_541  4433   /* FIX(0.541196100) */
+#define F_0_765  6270   /* FIX(0.765366865) */
+#define F_0_899  7373   /* FIX(0.899976223) */
+#define F_1_175  9633   /* FIX(1.175875602) */
+#define F_1_501  12299  /* FIX(1.501321110) */
+#define F_1_847  15137  /* FIX(1.847759065) */
+#define F_1_961  16069  /* FIX(1.961570560) */
+#define F_2_053  16819  /* FIX(2.053119869) */
+#define F_2_562  20995  /* FIX(2.562915447) */
+#define F_3_072  25172  /* FIX(3.072711026) */
 
-#define CONST_BITS 13
-#define PASS1_BITS 2
-#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
-#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
 
 
-#define DO_IDCT(in, PASS)  \
-{  \
-  /* Even part  \
-   *  \
-   * (Original)  \
-   * z1 = (z2 + z3) * 0.541196100;  \
-   * tmp2 = z1 + z3 * -1.847759065;  \
-   * tmp3 = z1 + z2 * 0.765366865;  \
-   *  \
-   * (This implementation)  \
-   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);  \
-   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;  \
-   */  \
+#define DO_IDCT(in, PASS) { \
+  /* Even part \
+   * \
+   * (Original) \
+   * z1 = (z2 + z3) * 0.541196100; \
+   * tmp2 = z1 + z3 * -1.847759065; \
+   * tmp3 = z1 + z2 * 0.765366865; \
+   * \
+   * (This implementation) \
+   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
+   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
+   */ \
   \
-  in##26l = vec_mergeh(in##2, in##6);  \
-  in##26h = vec_mergel(in##2, in##6);  \
+  in##26l = vec_mergeh(in##2, in##6); \
+  in##26h = vec_mergel(in##2, in##6); \
   \
-  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero);  \
-  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero);  \
-  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero);  \
-  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero);  \
+  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
+  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
+  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
+  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
   \
-  tmp0 = vec_add(in##0, in##4);  \
-  tmp1 = vec_sub(in##0, in##4);  \
+  tmp0 = vec_add(in##0, in##4); \
+  tmp1 = vec_sub(in##0, in##4); \
   \
-  tmp0l = vec_unpackh(tmp0);  \
-  tmp0h = vec_unpackl(tmp0);  \
-  tmp0l = vec_sl(tmp0l, const_bits);  \
-  tmp0h = vec_sl(tmp0h, const_bits);  \
-  tmp0l = vec_add(tmp0l, pd_descale_p##PASS);  \
-  tmp0h = vec_add(tmp0h, pd_descale_p##PASS);  \
+  tmp0l = vec_unpackh(tmp0); \
+  tmp0h = vec_unpackl(tmp0); \
+  tmp0l = vec_sl(tmp0l, const_bits); \
+  tmp0h = vec_sl(tmp0h, const_bits); \
+  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
+  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
   \
-  tmp10l = vec_add(tmp0l, tmp3l);  \
-  tmp10h = vec_add(tmp0h, tmp3h);  \
-  tmp13l = vec_sub(tmp0l, tmp3l);  \
-  tmp13h = vec_sub(tmp0h, tmp3h);  \
+  tmp10l = vec_add(tmp0l, tmp3l); \
+  tmp10h = vec_add(tmp0h, tmp3h); \
+  tmp13l = vec_sub(tmp0l, tmp3l); \
+  tmp13h = vec_sub(tmp0h, tmp3h); \
   \
-  tmp1l = vec_unpackh(tmp1);  \
-  tmp1h = vec_unpackl(tmp1);  \
-  tmp1l = vec_sl(tmp1l, const_bits);  \
-  tmp1h = vec_sl(tmp1h, const_bits);  \
-  tmp1l = vec_add(tmp1l, pd_descale_p##PASS);  \
-  tmp1h = vec_add(tmp1h, pd_descale_p##PASS);  \
+  tmp1l = vec_unpackh(tmp1); \
+  tmp1h = vec_unpackl(tmp1); \
+  tmp1l = vec_sl(tmp1l, const_bits); \
+  tmp1h = vec_sl(tmp1h, const_bits); \
+  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
+  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
   \
-  tmp11l = vec_add(tmp1l, tmp2l);  \
-  tmp11h = vec_add(tmp1h, tmp2h);  \
-  tmp12l = vec_sub(tmp1l, tmp2l);  \
-  tmp12h = vec_sub(tmp1h, tmp2h);  \
+  tmp11l = vec_add(tmp1l, tmp2l); \
+  tmp11h = vec_add(tmp1h, tmp2h); \
+  tmp12l = vec_sub(tmp1l, tmp2l); \
+  tmp12h = vec_sub(tmp1h, tmp2h); \
   \
-  /* Odd part */  \
+  /* Odd part */ \
   \
-  z3 = vec_add(in##3, in##7);  \
-  z4 = vec_add(in##1, in##5);  \
+  z3 = vec_add(in##3, in##7); \
+  z4 = vec_add(in##1, in##5); \
   \
-  /* (Original)  \
-   * z5 = (z3 + z4) * 1.175875602;  \
-   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
-   * z3 += z5;  z4 += z5;  \
-   *  \
-   * (This implementation)  \
-   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
-   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
-   */  \
+  /* (Original) \
+   * z5 = (z3 + z4) * 1.175875602; \
+   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
+   * z3 += z5;  z4 += z5; \
+   * \
+   * (This implementation) \
+   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
+   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
+   */ \
   \
-  z34l = vec_mergeh(z3, z4);  \
-  z34h = vec_mergel(z3, z4);  \
+  z34l = vec_mergeh(z3, z4); \
+  z34h = vec_mergel(z3, z4); \
   \
-  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero);  \
-  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero);  \
-  z4l = vec_msums(z34l, pw_f117_f078, pd_zero);  \
-  z4h = vec_msums(z34h, pw_f117_f078, pd_zero);  \
+  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
+  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
+  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
+  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
   \
-  /* (Original)  \
-   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;  \
-   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;  \
-   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;  \
-   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
-   * tmp0 += z1 + z3;  tmp1 += z2 + z4;  \
-   * tmp2 += z2 + z3;  tmp3 += z1 + z4;  \
-   *  \
-   * (This implementation)  \
-   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;  \
-   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;  \
-   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);  \
-   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);  \
-   * tmp0 += z3;  tmp1 += z4;  \
-   * tmp2 += z3;  tmp3 += z4;  \
-   */  \
+  /* (Original) \
+   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
+   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
+   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
+   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
+   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
+   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
+   * \
+   * (This implementation) \
+   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
+   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
+   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
+   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
+   * tmp0 += z3;  tmp1 += z4; \
+   * tmp2 += z3;  tmp3 += z4; \
+   */ \
   \
-  in##71l = vec_mergeh(in##7, in##1);  \
-  in##71h = vec_mergel(in##7, in##1);  \
+  in##71l = vec_mergeh(in##7, in##1); \
+  in##71h = vec_mergel(in##7, in##1); \
   \
-  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l);  \
-  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h);  \
-  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l);  \
-  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h);  \
+  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
+  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
+  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
+  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
   \
-  in##53l = vec_mergeh(in##5, in##3);  \
-  in##53h = vec_mergel(in##5, in##3);  \
+  in##53l = vec_mergeh(in##5, in##3); \
+  in##53h = vec_mergel(in##5, in##3); \
   \
-  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l);  \
-  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h);  \
-  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l);  \
-  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h);  \
+  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
+  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
+  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
+  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
   \
-  /* Final output stage */  \
+  /* Final output stage */ \
   \
-  out0l = vec_add(tmp10l, tmp3l);  \
-  out0h = vec_add(tmp10h, tmp3h);  \
-  out7l = vec_sub(tmp10l, tmp3l);  \
-  out7h = vec_sub(tmp10h, tmp3h);  \
+  out0l = vec_add(tmp10l, tmp3l); \
+  out0h = vec_add(tmp10h, tmp3h); \
+  out7l = vec_sub(tmp10l, tmp3l); \
+  out7h = vec_sub(tmp10h, tmp3h); \
   \
-  out0l = vec_sra(out0l, descale_p##PASS);  \
-  out0h = vec_sra(out0h, descale_p##PASS);  \
-  out7l = vec_sra(out7l, descale_p##PASS);  \
-  out7h = vec_sra(out7h, descale_p##PASS);  \
+  out0l = vec_sra(out0l, descale_p##PASS); \
+  out0h = vec_sra(out0h, descale_p##PASS); \
+  out7l = vec_sra(out7l, descale_p##PASS); \
+  out7h = vec_sra(out7h, descale_p##PASS); \
   \
-  out0 = vec_pack(out0l, out0h);  \
-  out7 = vec_pack(out7l, out7h);  \
+  out0 = vec_pack(out0l, out0h); \
+  out7 = vec_pack(out7l, out7h); \
   \
-  out1l = vec_add(tmp11l, tmp2l);  \
-  out1h = vec_add(tmp11h, tmp2h);  \
-  out6l = vec_sub(tmp11l, tmp2l);  \
-  out6h = vec_sub(tmp11h, tmp2h);  \
+  out1l = vec_add(tmp11l, tmp2l); \
+  out1h = vec_add(tmp11h, tmp2h); \
+  out6l = vec_sub(tmp11l, tmp2l); \
+  out6h = vec_sub(tmp11h, tmp2h); \
   \
-  out1l = vec_sra(out1l, descale_p##PASS);  \
-  out1h = vec_sra(out1h, descale_p##PASS);  \
-  out6l = vec_sra(out6l, descale_p##PASS);  \
-  out6h = vec_sra(out6h, descale_p##PASS);  \
+  out1l = vec_sra(out1l, descale_p##PASS); \
+  out1h = vec_sra(out1h, descale_p##PASS); \
+  out6l = vec_sra(out6l, descale_p##PASS); \
+  out6h = vec_sra(out6h, descale_p##PASS); \
   \
-  out1 = vec_pack(out1l, out1h);  \
-  out6 = vec_pack(out6l, out6h);  \
+  out1 = vec_pack(out1l, out1h); \
+  out6 = vec_pack(out6l, out6h); \
   \
-  out2l = vec_add(tmp12l, tmp1l);  \
-  out2h = vec_add(tmp12h, tmp1h);  \
-  out5l = vec_sub(tmp12l, tmp1l);  \
-  out5h = vec_sub(tmp12h, tmp1h);  \
+  out2l = vec_add(tmp12l, tmp1l); \
+  out2h = vec_add(tmp12h, tmp1h); \
+  out5l = vec_sub(tmp12l, tmp1l); \
+  out5h = vec_sub(tmp12h, tmp1h); \
   \
-  out2l = vec_sra(out2l, descale_p##PASS);  \
-  out2h = vec_sra(out2h, descale_p##PASS);  \
-  out5l = vec_sra(out5l, descale_p##PASS);  \
-  out5h = vec_sra(out5h, descale_p##PASS);  \
+  out2l = vec_sra(out2l, descale_p##PASS); \
+  out2h = vec_sra(out2h, descale_p##PASS); \
+  out5l = vec_sra(out5l, descale_p##PASS); \
+  out5h = vec_sra(out5h, descale_p##PASS); \
   \
-  out2 = vec_pack(out2l, out2h);  \
-  out5 = vec_pack(out5l, out5h);  \
+  out2 = vec_pack(out2l, out2h); \
+  out5 = vec_pack(out5l, out5h); \
   \
-  out3l = vec_add(tmp13l, tmp0l);  \
-  out3h = vec_add(tmp13h, tmp0h);  \
-  out4l = vec_sub(tmp13l, tmp0l);  \
-  out4h = vec_sub(tmp13h, tmp0h);  \
+  out3l = vec_add(tmp13l, tmp0l); \
+  out3h = vec_add(tmp13h, tmp0h); \
+  out4l = vec_sub(tmp13l, tmp0l); \
+  out4h = vec_sub(tmp13h, tmp0h); \
   \
-  out3l = vec_sra(out3l, descale_p##PASS);  \
-  out3h = vec_sra(out3h, descale_p##PASS);  \
-  out4l = vec_sra(out4l, descale_p##PASS);  \
-  out4h = vec_sra(out4h, descale_p##PASS);  \
+  out3l = vec_sra(out3l, descale_p##PASS); \
+  out3h = vec_sra(out3h, descale_p##PASS); \
+  out4l = vec_sra(out4l, descale_p##PASS); \
+  out4h = vec_sra(out4h, descale_p##PASS); \
   \
-  out3 = vec_pack(out3l, out3h);  \
-  out4 = vec_pack(out4l, out4h);  \
+  out3 = vec_pack(out3l, out3h); \
+  out4 = vec_pack(out4l, out4h); \
 }
 
 
-void
-jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
-                          JSAMPARRAY output_buf, JDIMENSION output_col)
+void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
+                              JSAMPARRAY output_buf, JDIMENSION output_col)
 {
   short *dct_table = (short *)dct_table_;
   int *outptr;
diff --git a/media/libjpeg/simd/jquanti-altivec.c b/media/libjpeg/simd/powerpc/jquanti-altivec.c
similarity index 88%
rename from media/libjpeg/simd/jquanti-altivec.c
rename to media/libjpeg/simd/powerpc/jquanti-altivec.c
index 25cc296f7a..7d6e32542b 100644
--- a/media/libjpeg/simd/jquanti-altivec.c
+++ b/media/libjpeg/simd/powerpc/jquanti-altivec.c
@@ -31,26 +31,25 @@
  */
 #if __BIG_ENDIAN__
 
-#define LOAD_ROW(row) {  \
-  elemptr = sample_data[row] + start_col;  \
-  in##row = vec_ld(0, elemptr);  \
-  if ((size_t)elemptr & 15)  \
-    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_ld(0, elemptr); \
+  if ((size_t)elemptr & 15) \
+    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
 }
 
 #else
 
-#define LOAD_ROW(row) {  \
-  elemptr = sample_data[row] + start_col;  \
-  in##row = vec_vsx_ld(0, elemptr);  \
+#define LOAD_ROW(row) { \
+  elemptr = sample_data[row] + start_col; \
+  in##row = vec_vsx_ld(0, elemptr); \
 }
 
 #endif
 
 
-void
-jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
-                        DCTELEM *workspace)
+void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
+                            DCTELEM *workspace)
 {
   JSAMPROW elemptr;
 
@@ -99,24 +98,23 @@ jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
 }
 
 
-#define WORD_BIT 16
+#define WORD_BIT  16
 
 /* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
    We basically need an unsigned equivalent of vec_madds(). */
 
-#define MULTIPLY(vs0, vs1, out) {  \
-  tmpe = vec_mule((__vector unsigned short)vs0,  \
-                  (__vector unsigned short)vs1);  \
-  tmpo = vec_mulo((__vector unsigned short)vs0,  \
-                  (__vector unsigned short)vs1);  \
-  out = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
-                                 (__vector unsigned short)tmpo,  \
-                                 shift_pack_index);  \
+#define MULTIPLY(vs0, vs1, out) { \
+  tmpe = vec_mule((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  tmpo = vec_mulo((__vector unsigned short)vs0, \
+                  (__vector unsigned short)vs1); \
+  out = (__vector short)vec_perm((__vector unsigned short)tmpe, \
+                                 (__vector unsigned short)tmpo, \
+                                 shift_pack_index); \
 }
 
-void
-jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
-                        DCTELEM *workspace)
+void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
+                            DCTELEM *workspace)
 {
   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
     row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
@@ -129,10 +127,10 @@ jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
   __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
 #if __BIG_ENDIAN__
   __vector unsigned char shift_pack_index =
-    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
+    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
 #else
   __vector unsigned char shift_pack_index =
-    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
+    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
 #endif
 
   row0 = vec_ld(0, workspace);
diff --git a/media/libjpeg/simd/powerpc/jsimd.c b/media/libjpeg/simd/powerpc/jsimd.c
new file mode 100644
index 0000000000..b9e86dcfac
--- /dev/null
+++ b/media/libjpeg/simd/powerpc/jsimd.c
@@ -0,0 +1,881 @@
+/*
+ * jsimd_powerpc.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * PowerPC architecture.
+ */
+
+#ifdef __amigaos4__
+/* This must be defined first as it re-defines GLOBAL otherwise */
+#include <proto/exec.h>
+#endif
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#if defined(__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#elif defined(__FreeBSD__)
+#include <machine/cpu.h>
+#include <sys/auxv.h>
+#endif
+
+static unsigned int simd_support = ~0;
+
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p;
+
+  if (*feature == 0)
+    return 0;
+  if (strncmp(buffer, "cpu", 3) != 0)
+    return 0;
+  buffer += 3;
+  while (isspace(*buffer))
+    buffer++;
+
+  /* Check if 'feature' is present in the buffer as a separate word */
+  while ((p = strstr(buffer, feature))) {
+    if (p > buffer && !isspace(*(p - 1))) {
+      buffer++;
+      continue;
+    }
+    p += strlen(feature);
+    if (*p != 0 && !isspace(*p)) {
+      buffer++;
+      continue;
+    }
+    return 1;
+  }
+  return 0;
+}
+
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+
+  simd_support = 0;
+
+  if (!buffer)
+    return 0;
+
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "altivec"))
+        simd_support |= JSIMD_ALTIVEC;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char *env = NULL;
+#endif
+#if !defined(__ALTIVEC__) && (defined(__linux__) || defined(ANDROID) || defined(__ANDROID__))
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#elif defined(__amigaos4__)
+  uint32 altivec = 0;
+#elif defined(__OpenBSD__)
+  int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+  int altivec;
+  size_t len = sizeof(altivec);
+#elif defined(__FreeBSD__)
+  unsigned long cpufeatures = 0;
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = 0;
+
+#if defined(__ALTIVEC__) || defined(__APPLE__)
+  simd_support |= JSIMD_ALTIVEC;
+#elif defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__amigaos4__)
+  IExec->GetCPUInfoTags(GCIT_VectorUnit, &altivec, TAG_DONE);
+  if (altivec == VECTORTYPE_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__OpenBSD__)
+  if (sysctl(mib, 2, &altivec, &len, NULL, 0) == 0 && altivec != 0)
+    simd_support |= JSIMD_ALTIVEC;
+#elif defined(__FreeBSD__)
+  elf_aux_info(AT_HWCAP, &cpufeatures, sizeof(cpufeatures));
+  if (cpufeatures & PPC_FEATURE_HAS_ALTIVEC)
+    simd_support |= JSIMD_ALTIVEC;
+#endif
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEALTIVEC");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_ALTIVEC;
+  env = getenv("JSIMD_FORCENONE");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_ycc_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_ycc_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_ycc_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_ycc_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_extrgb_gray_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_extrgbx_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_extbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_extbgrx_gray_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_extxbgr_gray_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_extxrgb_gray_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_rgb_gray_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_ycc_extrgb_convert_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_ycc_extrgbx_convert_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_ycc_extbgr_convert_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_ycc_extbgrx_convert_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_ycc_extxbgr_convert_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_ycc_extxrgb_convert_altivec;
+    break;
+  default:
+    altivecfct = jsimd_ycc_rgb_convert_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v2_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  jsimd_h2v1_downsample_altivec(cinfo->image_width, cinfo->max_v_samp_factor,
+                                compptr->v_samp_factor,
+                                compptr->width_in_blocks, input_data,
+                                output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_upsample_altivec(cinfo->max_v_samp_factor, cinfo->output_width,
+                              input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v2_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h2v1_fancy_upsample_altivec(cinfo->max_v_samp_factor,
+                                    compptr->downsampled_width, input_data,
+                                    output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v2_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v2_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v2_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v2_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v2_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v2_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v2_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*altivecfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    altivecfct = jsimd_h2v1_extrgb_merged_upsample_altivec;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    altivecfct = jsimd_h2v1_extrgbx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGR:
+    altivecfct = jsimd_h2v1_extbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    altivecfct = jsimd_h2v1_extbgrx_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    altivecfct = jsimd_h2v1_extxbgr_merged_upsample_altivec;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    altivecfct = jsimd_h2v1_extxrgb_merged_upsample_altivec;
+    break;
+  default:
+    altivecfct = jsimd_h2v1_merged_upsample_altivec;
+    break;
+  }
+
+  altivecfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  jsimd_convsamp_altivec(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  jsimd_fdct_islow_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_altivec(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  jsimd_quantize_altivec(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_ALTIVEC)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_islow_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_altivec(compptr->dct_table, coef_block, output_buf,
+                           output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return NULL;
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return 0;
+}
diff --git a/media/libjpeg/simd/jsimd_altivec.h b/media/libjpeg/simd/powerpc/jsimd_altivec.h
similarity index 75%
rename from media/libjpeg/simd/jsimd_altivec.h
rename to media/libjpeg/simd/powerpc/jsimd_altivec.h
index 62dbc5cdf0..e8bdb06a54 100644
--- a/media/libjpeg/simd/jsimd_altivec.h
+++ b/media/libjpeg/simd/powerpc/jsimd_altivec.h
@@ -21,28 +21,27 @@
  */
 
 #define JPEG_INTERNALS
-#include "../jinclude.h"
-#include "../jpeglib.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
 #include "../jsimd.h"
-#include "../jdct.h"
-#include "../jsimddct.h"
-#include "jsimd.h"
 #include <altivec.h>
 
 
 /* Common code */
 
-#define __4X(a) a, a, a, a
-#define __4X2(a, b) a, b, a, b, a, b, a, b
-#define __8X(a) __4X(a), __4X(a)
-#define __16X(a) __8X(a), __8X(a)
+#define __4X(a)      a, a, a, a
+#define __4X2(a, b)  a, b, a, b, a, b, a, b
+#define __8X(a)      __4X(a), __4X(a)
+#define __16X(a)     __8X(a), __8X(a)
 
-#define TRANSPOSE(row, col)  \
-{  \
-  __vector short row04l, row04h, row15l, row15h,  \
-                 row26l, row26h, row37l, row37h;  \
-  __vector short col01e, col01o, col23e, col23o,  \
-                 col45e, col45o, col67e, col67o;  \
+#define TRANSPOSE(row, col) { \
+  __vector short row04l, row04h, row15l, row15h, \
+                 row26l, row26h, row37l, row37h; \
+  __vector short col01e, col01o, col23e, col23o, \
+                 col45e, col45o, col67e, col67o; \
   \
                                        /* transpose coefficients (phase 1) */ \
   row04l = vec_mergeh(row##0, row##4); /* row04l=(00 40 01 41 02 42 03 43) */ \
@@ -65,18 +64,18 @@
   col67o = vec_mergel(row15h, row37h); /* col67o=(16 36 56 76 17 37 57 77) */ \
   \
                                        /* transpose coefficients (phase 3) */ \
-  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */   \
-  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */   \
-  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */   \
-  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */   \
-  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */   \
-  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */   \
-  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */   \
-  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */   \
+  col##0 = vec_mergeh(col01e, col01o); /* col0=(00 10 20 30 40 50 60 70) */ \
+  col##1 = vec_mergel(col01e, col01o); /* col1=(01 11 21 31 41 51 61 71) */ \
+  col##2 = vec_mergeh(col23e, col23o); /* col2=(02 12 22 32 42 52 62 72) */ \
+  col##3 = vec_mergel(col23e, col23o); /* col3=(03 13 23 33 43 53 63 73) */ \
+  col##4 = vec_mergeh(col45e, col45o); /* col4=(04 14 24 34 44 54 64 74) */ \
+  col##5 = vec_mergel(col45e, col45o); /* col5=(05 15 25 35 45 55 65 75) */ \
+  col##6 = vec_mergeh(col67e, col67o); /* col6=(06 16 26 36 46 56 66 76) */ \
+  col##7 = vec_mergel(col67e, col67o); /* col7=(07 17 27 37 47 57 67 77) */ \
 }
 
 #ifndef min
-#define min(a,b) ((a) < (b) ? (a) : (b))
+#define min(a, b)  ((a) < (b) ? (a) : (b))
 #endif
 
 
@@ -84,16 +83,16 @@
 
 #if __BIG_ENDIAN__
 
-#define VEC_LD(a, b) vec_ld(a, b)
-#define VEC_ST(a, b, c) vec_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(pb_zero, a)
-#define VEC_UNPACKLU(a) vec_mergel(pb_zero, a)
+#define VEC_LD(a, b)     vec_ld(a, b)
+#define VEC_ST(a, b, c)  vec_st(a, b, c)
+#define VEC_UNPACKHU(a)  vec_mergeh(pb_zero, a)
+#define VEC_UNPACKLU(a)  vec_mergel(pb_zero, a)
 
 #else
 
-#define VEC_LD(a, b) vec_vsx_ld(a, b)
-#define VEC_ST(a, b, c) vec_vsx_st(a, b, c)
-#define VEC_UNPACKHU(a) vec_mergeh(a, pb_zero)
-#define VEC_UNPACKLU(a) vec_mergel(a, pb_zero)
+#define VEC_LD(a, b)     vec_vsx_ld(a, b)
+#define VEC_ST(a, b, c)  vec_vsx_st(a, b, c)
+#define VEC_UNPACKHU(a)  vec_mergeh(a, pb_zero)
+#define VEC_UNPACKLU(a)  vec_mergel(a, pb_zero)
 
 #endif
diff --git a/media/libjpeg/simd/x86_64/jccolext-avx2.asm b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
new file mode 100644
index 0000000000..ffb527db00
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-avx2.asm
@@ -0,0 +1,559 @@
+;
+; jccolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  8
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
+
+EXTN(jsimd_rgb_ycc_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdx
+    push        rbx
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_ycc_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_ycc_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
+    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
+    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
+    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vmovdqa     ymm7, ymm1
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vpxor       ymm1, ymm1, ymm1
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
+    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
+    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
+
+    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm7, ymm7, ymm5
+    vpaddd      ymm4, ymm4, ymm5
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
+    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
+
+    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vmovdqa     ymm5, ymm0
+    vmovdqa     ymm4, ymm6
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vpxor       ymm0, ymm0, ymm0
+    vpxor       ymm6, ymm6, ymm6
+    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
+    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
+    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
+    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm5, ymm5, ymm0
+    vpaddd      ymm4, ymm4, ymm6
+    vpaddd      ymm5, ymm5, ymm1
+    vpaddd      ymm4, ymm4, ymm1
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
+    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
+    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb
+
+    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
+    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
+    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vmovdqa     ymm7, ymm0
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vpxor       ymm3, ymm3, ymm3
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
+    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
+    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
+
+    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm7, ymm7, ymm3
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm7, ymm7, ymm1
+    vpaddd      ymm5, ymm5, ymm1
+    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
+    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
+
+    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vmovdqa     ymm1, ymm6
+    vmovdqa     ymm5, ymm4
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm4, ymm4, ymm4
+    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
+    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
+    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
+    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
+
+    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]
+
+    vpaddd      ymm1, ymm1, ymm2
+    vpaddd      ymm5, ymm5, ymm4
+    vpaddd      ymm1, ymm1, ymm0
+    vpaddd      ymm5, ymm5, ymm0
+    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
+    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
+    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
+    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_YMMWORD
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
+    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jccolext-sse2.asm b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
new file mode 100644
index 0000000000..af70ed6010
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolext-sse2.asm
@@ -0,0 +1,484 @@
+;
+; jccolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                            int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  8
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)
+
+EXTN(jsimd_rgb_ycc_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdx
+    push        rbx
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    movd        xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .rgb_ycc_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_ycc_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_ycc_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_ycc_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
+    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
+    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    pxor        xmm1, xmm1
+    pxor        xmm6, xmm6
+    punpcklwd   xmm1, xmm5              ; xmm1=BOL
+    punpckhwd   xmm6, xmm5              ; xmm6=BOH
+    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)
+
+    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm1
+    paddd       xmm4, xmm6
+    paddd       xmm7, xmm5
+    paddd       xmm4, xmm5
+    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
+    packssdw    xmm7, xmm4              ; xmm7=CbO
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm6
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
+    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
+
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    pxor        xmm0, xmm0
+    pxor        xmm6, xmm6
+    punpcklwd   xmm0, xmm1              ; xmm0=BEL
+    punpckhwd   xmm6, xmm1              ; xmm6=BEH
+    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
+    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm5, xmm0
+    paddd       xmm4, xmm6
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
+    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
+    packssdw    xmm5, xmm4              ; xmm5=CbE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm5, xmm7              ; xmm5=Cb
+    movdqa      XMMWORD [rbx], xmm5     ; Save Cb
+
+    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    movdqa      xmm7, xmm0
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, XMMWORD [wk(4)]
+    paddd       xmm4, XMMWORD [wk(5)]
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    pxor        xmm3, xmm3
+    pxor        xmm4, xmm4
+    punpcklwd   xmm3, xmm1              ; xmm3=ROL
+    punpckhwd   xmm4, xmm1              ; xmm4=ROH
+    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)
+
+    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm7, xmm3
+    paddd       xmm5, xmm4
+    paddd       xmm7, xmm1
+    paddd       xmm5, xmm1
+    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
+    packssdw    xmm7, xmm5              ; xmm7=CrO
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
+    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(6)]
+    paddd       xmm4, XMMWORD [wk(7)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    pxor        xmm2, xmm2
+    pxor        xmm4, xmm4
+    punpcklwd   xmm2, xmm3              ; xmm2=REL
+    punpckhwd   xmm4, xmm3              ; xmm4=REH
+    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
+    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)
+
+    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]
+
+    paddd       xmm1, xmm2
+    paddd       xmm5, xmm4
+    paddd       xmm1, xmm0
+    paddd       xmm5, xmm0
+    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
+    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
+    packssdw    xmm1, xmm5              ; xmm1=CrE
+
+    psllw       xmm7, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Cr
+    movdqa      XMMWORD [rdx], xmm1     ; Save Cr
+
+    sub         rcx, byte SIZEOF_XMMWORD
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
+    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+    pop         rbx
+    pop         rdx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jccolor-avx2.asm b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
new file mode 100644
index 0000000000..16b78298dc
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-avx2.asm
@@ -0,0 +1,121 @@
+;
+; jccolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_avx2)
+
+EXTN(jconst_rgb_ycc_convert_avx2):
+
+PW_F0299_F0337  times 8 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 8 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 8 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 8 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 8 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 8 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extrgbx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extbgrx_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxbgr_ycc_convert_avx2
+%include "jccolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_avx2  jsimd_extxrgb_ycc_convert_avx2
+%include "jccolext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jccolor-sse2.asm b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
new file mode 100644
index 0000000000..e2955c2134
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jccolor-sse2.asm
@@ -0,0 +1,120 @@
+;
+; jccolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_081 equ  5329                ; FIX(0.08131)
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_168 equ 11059                ; FIX(0.16874)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_331 equ 21709                ; FIX(0.33126)
+F_0_418 equ 27439                ; FIX(0.41869)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_ycc_convert_sse2)
+
+EXTN(jconst_rgb_ycc_convert_sse2):
+
+PW_F0299_F0337  times 4 dw  F_0_299,  F_0_337
+PW_F0114_F0250  times 4 dw  F_0_114,  F_0_250
+PW_MF016_MF033  times 4 dw -F_0_168, -F_0_331
+PW_MF008_MF041  times 4 dw -F_0_081, -F_0_418
+PD_ONEHALFM1_CJ times 4 dd  (1 << (SCALEBITS - 1)) - 1 + \
+                            (CENTERJSAMPLE << SCALEBITS)
+PD_ONEHALF      times 4 dd  (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extrgbx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extbgrx_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxbgr_ycc_convert_sse2
+%include "jccolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_ycc_convert_sse2  jsimd_extxrgb_ycc_convert_sse2
+%include "jccolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-avx2.asm b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
new file mode 100644
index 0000000000..591255bb11
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-avx2.asm
@@ -0,0 +1,113 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_avx2)
+
+EXTN(jconst_rgb_gray_convert_avx2):
+
+PW_F0299_F0337 times 8 dw F_0_299, F_0_337
+PW_F0114_F0250 times 8 dw F_0_114, F_0_250
+PD_ONEHALF     times 8 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extrgbx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extbgrx_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxbgr_gray_convert_avx2
+%include "jcgryext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_avx2  jsimd_extxrgb_gray_convert_avx2
+%include "jcgryext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgray-sse2.asm b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
new file mode 100644
index 0000000000..e389904f2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgray-sse2.asm
@@ -0,0 +1,112 @@
+;
+; jcgray.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_114 equ  7471                ; FIX(0.11400)
+F_0_250 equ 16384                ; FIX(0.25000)
+F_0_299 equ 19595                ; FIX(0.29900)
+F_0_587 equ 38470                ; FIX(0.58700)
+F_0_337 equ (F_0_587 - F_0_250)  ; FIX(0.58700) - FIX(0.25000)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_rgb_gray_convert_sse2)
+
+EXTN(jconst_rgb_gray_convert_sse2):
+
+PW_F0299_F0337 times 4 dw F_0_299, F_0_337
+PW_F0114_F0250 times 4 dw F_0_114, F_0_250
+PD_ONEHALF     times 4 dd (1 << (SCALEBITS - 1))
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extrgbx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extbgrx_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxbgr_gray_convert_sse2
+%include "jcgryext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_rgb_gray_convert_sse2  jsimd_extxrgb_gray_convert_sse2
+%include "jcgryext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jcgryext-avx2.asm b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
new file mode 100644
index 0000000000..ddcc2c0a2f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-avx2.asm
@@ -0,0 +1,438 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)
+
+EXTN(jsimd_rgb_gray_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    vmovd       xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    vmovd       xmmF, XMM_DWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_DWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    sub         rcx, byte SIZEOF_XMMWORD
+    vmovdqu     xmmB, XMM_MMWORD [rsi+rcx]
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    vpor        ymmA, ymmB
+.column_ld32:
+    test        cl, SIZEOF_YMMWORD
+    jz          short .column_ld64
+    sub         rcx, byte SIZEOF_YMMWORD
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+.column_ld64:
+    test        cl, 2*SIZEOF_YMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmB, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vmovdqu     ymmC, ymmA
+    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+
+    vmovdqa     ymmG, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
+                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
+    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
+                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
+                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
+    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
+                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
+
+    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
+                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
+    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
+                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
+
+    vmovdqa     ymmD, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
+                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
+    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
+                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
+                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
+    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
+                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
+
+    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
+                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
+    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
+                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
+
+    vmovdqa     ymmE, ymmA
+    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
+                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
+    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
+                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
+
+    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
+                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
+
+    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
+                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
+                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
+
+    vpxor       ymmH, ymmH, ymmH
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmB, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+
+    vmovdqa     ymmF, ymmD
+    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
+    vpor        xmmA, xmmA, xmmF
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    vmovdqa     xmmF, xmmA
+    vperm2i128  ymmF, ymmF, ymmF, 1
+    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+    vpor        ymmA, ymmA, ymmF
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_XMMWORD/2
+    vmovdqa     ymmF, ymmA
+    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_YMMWORD
+    jz          short .rgb_gray_cnv
+    vmovdqa     ymmE, ymmA
+    vmovdqa     ymmH, ymmF
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
+    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
+
+.rgb_gray_cnv:
+    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmB, ymmA
+    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+
+    vmovdqa     ymmB, ymmF
+    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    vmovdqa     ymmD, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
+                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
+    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
+                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
+
+    vmovdqa     ymmC, ymmF
+    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
+                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
+    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
+                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
+
+    vmovdqa     ymmB, ymmA
+    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
+                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
+    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
+                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
+
+    vmovdqa     ymmG, ymmD
+    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
+                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
+    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
+                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
+
+    vmovdqa     ymmE, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
+                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
+    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
+                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vmovdqa     ymmH, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
+                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
+    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
+                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
+
+    vpxor       ymmF, ymmF, ymmF
+
+    vmovdqa     ymmC, ymmA
+    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
+    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
+
+    vmovdqa     ymmD, ymmB
+    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
+    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
+
+    vmovdqa     ymmG, ymmE
+    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
+    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
+
+    vpunpcklbw  ymmF, ymmF, ymmH
+    vpunpckhbw  ymmH, ymmH, ymmH
+    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
+    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
+    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    vmovdqa     ymm6, ymm1
+    vpunpcklwd  ymm1, ymm1, ymm3
+    vpunpckhwd  ymm6, ymm6, ymm3
+    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    vmovdqa     ymm6, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm2
+    vpunpckhwd  ymm6, ymm6, ymm2
+    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    vmovdqa     ymm0, ymm5              ; ymm0=BO
+    vmovdqa     ymm6, ymm4              ; ymm6=BE
+
+    vmovdqa     ymm4, ymm0
+    vpunpcklwd  ymm0, ymm0, ymm3
+    vpunpckhwd  ymm4, ymm4, ymm3
+    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
+
+    vpaddd      ymm0, ymm0, ymm1
+    vpaddd      ymm4, ymm4, ymm7
+    vpaddd      ymm0, ymm0, ymm3
+    vpaddd      ymm4, ymm4, ymm3
+    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
+    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
+
+    vmovdqa     ymm4, ymm6
+    vpunpcklwd  ymm6, ymm6, ymm2
+    vpunpckhwd  ymm4, ymm4, ymm2
+    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
+
+    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
+    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
+    vpaddd      ymm6, ymm6, ymm2
+    vpaddd      ymm4, ymm4, ymm2
+    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
+    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
+    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
+    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_YMMWORD
+    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcgryext-sse2.asm b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
new file mode 100644
index 0000000000..f1d399a63b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcgryext-sse2.asm
@@ -0,0 +1,363 @@
+;
+; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
+;
+; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
+;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
+;                             int num_rows);
+;
+
+; r10d = JDIMENSION img_width
+; r11 = JSAMPARRAY input_buf
+; r12 = JSAMPIMAGE output_buf
+; r13d = JDIMENSION output_row
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
+
+EXTN(jsimd_rgb_gray_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rsi, r12
+    mov         ecx, r13d
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rsi, r11
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rdi
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+.column_ld1:
+    push        rax
+    push        rdx
+    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
+    test        cl, SIZEOF_BYTE
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_BYTE
+    movzx       rax, byte [rsi+rcx]
+.column_ld2:
+    test        cl, SIZEOF_WORD
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_WORD
+    movzx       rdx, word [rsi+rcx]
+    shl         rax, WORD_BIT
+    or          rax, rdx
+.column_ld4:
+    movd        xmmA, eax
+    pop         rdx
+    pop         rax
+    test        cl, SIZEOF_DWORD
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_DWORD
+    movd        xmmF, XMM_DWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_DWORD
+    por         xmmA, xmmF
+.column_ld8:
+    test        cl, SIZEOF_MMWORD
+    jz          short .column_ld16
+    sub         rcx, byte SIZEOF_MMWORD
+    movq        xmmB, XMM_MMWORD [rsi+rcx]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmB
+.column_ld16:
+    test        cl, SIZEOF_XMMWORD
+    jz          short .column_ld32
+    movdqa      xmmF, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .rgb_gray_cnv
+.column_ld32:
+    test        cl, 2*SIZEOF_XMMWORD
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmB, xmmA
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    movdqa      xmmG, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
+    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
+    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
+
+    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
+    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
+
+    movdqa      xmmD, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
+    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
+    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
+
+    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
+    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
+
+    movdqa      xmmE, xmmA
+    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
+    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
+
+    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
+
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
+
+    pxor        xmmH, xmmH
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmB, xmmE
+    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
+
+    movdqa      xmmF, xmmD
+    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+.column_ld1:
+    test        cl, SIZEOF_XMMWORD/16
+    jz          short .column_ld2
+    sub         rcx, byte SIZEOF_XMMWORD/16
+    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld2:
+    test        cl, SIZEOF_XMMWORD/8
+    jz          short .column_ld4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
+    pslldq      xmmA, SIZEOF_MMWORD
+    por         xmmA, xmmE
+.column_ld4:
+    test        cl, SIZEOF_XMMWORD/4
+    jz          short .column_ld8
+    sub         rcx, byte SIZEOF_XMMWORD/4
+    movdqa      xmmE, xmmA
+    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
+.column_ld8:
+    test        cl, SIZEOF_XMMWORD/2
+    mov         rcx, SIZEOF_XMMWORD
+    jz          short .rgb_gray_cnv
+    movdqa      xmmF, xmmA
+    movdqa      xmmH, xmmE
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    jmp         short .rgb_gray_cnv
+
+.columnloop:
+    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
+    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
+
+.rgb_gray_cnv:
+    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
+    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
+
+    movdqa      xmmC, xmmF
+    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
+    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
+
+    movdqa      xmmB, xmmA
+    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
+    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
+
+    movdqa      xmmG, xmmD
+    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
+    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
+
+    movdqa      xmmE, xmmA
+    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
+    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
+
+    movdqa      xmmH, xmmB
+    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
+    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
+
+    pxor        xmmF, xmmF
+
+    movdqa      xmmC, xmmA
+    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
+    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
+
+    movdqa      xmmD, xmmB
+    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
+    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
+
+    movdqa      xmmG, xmmE
+    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
+    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
+
+    punpcklbw   xmmF, xmmH
+    punpckhbw   xmmH, xmmH
+    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
+    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
+    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
+
+    ; (Original)
+    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+    ;
+    ; (This implementation)
+    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+
+    movdqa      xmm6, xmm1
+    punpcklwd   xmm1, xmm3
+    punpckhwd   xmm6, xmm3
+    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
+
+    movdqa      xmm6, xmm0
+    punpcklwd   xmm0, xmm2
+    punpckhwd   xmm6, xmm2
+    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
+    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
+
+    movdqa      xmm0, xmm5              ; xmm0=BO
+    movdqa      xmm6, xmm4              ; xmm6=BE
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm4, xmm3
+    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
+
+    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
+
+    paddd       xmm0, xmm1
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm3
+    paddd       xmm4, xmm3
+    psrld       xmm0, SCALEBITS         ; xmm0=YOL
+    psrld       xmm4, SCALEBITS         ; xmm4=YOH
+    packssdw    xmm0, xmm4              ; xmm0=YO
+
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm6, xmm2
+    punpckhwd   xmm4, xmm2
+    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
+    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
+
+    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
+
+    paddd       xmm6, XMMWORD [wk(0)]
+    paddd       xmm4, XMMWORD [wk(1)]
+    paddd       xmm6, xmm2
+    paddd       xmm4, xmm2
+    psrld       xmm6, SCALEBITS         ; xmm6=YEL
+    psrld       xmm4, SCALEBITS         ; xmm4=YEH
+    packssdw    xmm6, xmm4              ; xmm6=YE
+
+    psllw       xmm0, BYTE_BIT
+    por         xmm6, xmm0              ; xmm6=Y
+    movdqa      XMMWORD [rdi], xmm6     ; Save Y
+
+    sub         rcx, byte SIZEOF_XMMWORD
+    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .column_ld1
+
+    pop         rcx                     ; col
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
+    add         rdi, byte SIZEOF_JSAMPROW
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jchuff-sse2.asm b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
new file mode 100644
index 0000000000..9ea6df946e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jchuff-sse2.asm
@@ -0,0 +1,583 @@
+;
+; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
+;
+; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, D. R. Commander.
+; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation for Huffman coding of one block.
+; The following code is based on jchuff.c; see jchuff.c for more details.
+
+%include "jsimdext.inc"
+
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_huff_encode_one_block)
+
+EXTN(jconst_huff_encode_one_block):
+
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+               dd 0x000f, 0x001f, 0x003f, 0x007f
+               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+    alignz      32
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
+times 1 << 15 db 16
+
+    alignz      32
+
+%define NBITS(x)      nbits_base + x
+%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values
+; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 1-2
+    add         nbitsb, free_bitsb      ; nbits += free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    mov         tempd, code             ; temp = code;
+    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
+    mov         nbitsb, free_bitsb      ; nbits = free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    shr         tempd, nbitsb           ; temp >>= nbits;
+    or          tempq, put_buffer       ; temp |= put_buffer;
+    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
+    bswap       tempq                   ; temp = htonl(temp);
+    mov         put_buffer, codeq       ; put_buffer = code;
+    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+    %2
+    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
+    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
+                                        ; (speculative; will be overwritten if
+                                        ; code contains any 0xFF bytes)
+    add         free_bitsb, 64          ; free_bits += 64;
+    add         bufferp, 8              ; buffer += 8;
+    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
+    jz          %1                      ;   return;
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
+    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempd, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    jmp         %1                      ; return;
+%endmacro
+
+;
+; Encode a single block's worth of coefficients.
+;
+; GLOBAL(JOCTET *)
+; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
+;                                  JCOEFPTR block, int last_dc_val,
+;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8  - dctbl --> code_temp
+; r9  - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer       rax
+%ifdef WIN64
+%define bufferp      rax
+%else
+%define bufferp      raxp
+%endif
+%define tempq        rbx
+%define tempd        ebx
+%define tempb        bl
+%define temph        bh
+%define nbitsq       rcx
+%define nbits        ecx
+%define nbitsb       cl
+%define block        rdx
+%define nbits_base   rsi
+%define t            rdi
+%define td           edi
+%define codeq        rbp
+%define code         ebp
+%define dctbl        r8
+%define actbl        r9
+%define state        r10
+%define index        r11
+%define indexd       r11d
+%define put_buffer   r12
+%define put_bufferd  r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
+
+EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rax+48] = c_derived_tbl *dctbl
+; [rax+56] = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    mov         buffer, rdx
+    mov         block, r8
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        rsi
+    push        rdi
+    push        r12
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    mov         state, rcx
+    movsx       code, word [block]                        ;Z:     code = block[0];
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         code, r9d                                 ;Z:     code -= last_dc_val;
+    mov         dctbl, POINTER [rsp+6*8+4*8]
+    mov         actbl, POINTER [rsp+6*8+5*8]
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         nbits_base, [rel jpeg_nbits_table]
+    add         rsp, -DCTSIZE2 * SIZEOF_WORD
+    mov         t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        r12
+    mov         state, rdi
+    mov         buffer, rsi
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    movsx       codeq, word [block]                       ;Z:     code = block[0];
+    lea         nbits_base, [rel jpeg_nbits_table]
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ;   use red zone for t_
+
+%endif
+
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
+                                                          ;Z:     i.e. if code is positive
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    movsxd      codeq, code                               ;Z:     sign extend code
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
+    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         tempd, 16                                 ;Z:     temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+                                                          ;Z:     temp = dctbl->ehufco[nbits];
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    or          code, tempd                               ;Z:     code |= temp;
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define free_bitsq  rdx
+%define free_bitsd  edx
+%define free_bitsb  dl
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    shl         tempq, 48                                 ;Z:     temp <<= 48;
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
+    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
+    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp  r8d
+    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
+    mov         free_bitsd, [state+working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
+    shl         index, 32                                 ;Z:     index <<= 32;
+    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    or          index, tempq                              ;Z:     index |= temp;
+    not         index                                     ;Z:     index = ~index;
+    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
+    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
+    align       16
+.EMIT_CODE:                                               ;Z:     .EMIT_CODE:
+    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.BRLOOP:                                                  ; do {
+    lea         code_temp, [nbitsq - 16]                  ;   code_temp = nbits - 16;
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP_CODE                         ;     goto .EMIT_BRLOOP_CODE;
+    shl         put_buffer, nbitsb                        ;   put_buffer <<= nbits;
+    mov         nbits, code_temp                          ;   nbits = code_temp;
+    or          put_buffer, codeq                         ;   put_buffer |= code;
+    cmp         nbits, 16                                 ;   if (nbits <= 16)
+    jle         .ERLOOP                                   ;     break;
+    jmp         .BRLOOP                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+    times 5     nop
+.ENTRY_SKIP_EMIT_CODE:                                    ; .ENTRY_SKIP_EMIT_CODE:
+    shl         put_buffer, nbitsb                        ; put_buffer <<= nbits;
+    or          put_buffer, codeq                         ; put_buffer |= code;
+.BLOOP_COND:                                              ; .BLOOP_COND:
+    test        index, index                              ; if (index != 0)
+    jz          .ELOOP                                    ; {
+.BLOOP:                                                   ;   do {
+    xor         nbits, nbits                              ;     nbits = 0;  /* kill tzcnt input dependency */
+    tzcnt       nbitsq, index                             ;     nbits = # of trailing 0 bits in index
+    inc         nbits                                     ;     ++nbits;
+    lea         t, [t + nbitsq * 2]                       ;     t = &t[nbits];
+    shr         index, nbitsb                             ;     index >>= nbits;
+.EMIT_BRLOOP_CODE_END:                                    ; .EMIT_BRLOOP_CODE_END:
+    cmp         nbits, 16                                 ;     if (nbits > 16)
+    jg          .BRLOOP                                   ;       goto .BRLOOP;
+.ERLOOP:                                                  ; .ERLOOP:
+    movsx       codeq, word [t]                           ;     code = *t;
+    lea         tempd, [nbitsq * 2]                       ;     temp = nbits * 2;
+    movzx       nbits, byte [NBITS(codeq)]                ;     nbits = JPEG_NBITS(code);
+    lea         tempd, [nbitsq + tempq * 8]               ;     temp = temp * 8 + nbits;
+    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+                                                          ;     code_temp = actbl->ehufco[temp-16];
+    shl         code_temp, nbitsb                         ;     code_temp <<= nbits;
+    and         code, dword [MASK_BITS(nbitsq)]           ;     code &= (1 << nbits) - 1;
+    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+                                                          ;     free_bits -= actbl->ehufsi[temp-16];
+    or          code, code_temp                           ;     code |= code_temp;
+    sub         free_bitsb, nbitsb                        ;     if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_CODE                                ;       goto .EMIT_CODE;
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP                                    ;   } while (index != 0);
+.ELOOP:                                                   ; }  /* index != 0 */
+    sub         td, esp                                   ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
+%else
+    cmp         td, -2 * SIZEOF_WORD                      ; if (t != -2)
+%endif
+    je          .EFN                                      ; {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  .EFN                                      ;     insert code, flush buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+.EFN:                                                     ; } }
+    mov         [state + working_state.cur.put_buffer.simd], put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    mov         byte [state + working_state.cur.free_bits], free_bitsb
+                                                          ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
+    pop         r12
+    pop         rdi
+    pop         rsi
+    pop         rbp
+    pop         rbx
+%else
+    pop         r12
+    pop         rbp
+    pop         rbx
+%endif
+    ret
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP_CODE:
+    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+                                                          ; insert code, flush buffer,
+                                                          ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcphuff-sse2.asm b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
new file mode 100644
index 0000000000..01b5c0235f
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcphuff-sse2.asm
@@ -0,0 +1,639 @@
+;
+; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
+; (64-bit SSE2)
+;
+; Copyright (C) 2016, 2018, Matthieu Darbois
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains an SSE2 implementation of data preparation for progressive
+; Huffman encoding.  See jcphuff.c for more details.
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+; --------------------------------------------------------------------------
+; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
+; jsimd_encode_mcu_AC_refine_prepare_sse2()
+
+%macro LOAD16 0
+    pxor        N0, N0
+    pxor        N1, N1
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    mov         T1d, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    mov         T1d, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    mov         T1d, INT [LUT + 15*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+    pinsrw      X1, word [BLOCK + T1 * 2], 7
+%endmacro
+
+%macro LOAD15 0
+    pxor        N0, N0
+    pxor        N1, N1
+    pxor        X1, X1
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    mov         T1d, INT [LUT +  8*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+    pinsrw      X1, word [BLOCK + T1 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT +  9*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 10*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 11*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 12*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 13*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD15
+    mov         T1d, INT [LUT + 14*SIZEOF_INT]
+    pinsrw      X1, word [BLOCK + T1 * 2], 6
+%%.ELOAD15:
+%endmacro
+
+%macro LOAD8 0
+    pxor        N0, N0
+
+    mov         T0d, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 0
+
+    mov         T0d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 1
+
+    mov         T0d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 2
+
+    mov         T0d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 3
+
+    mov         T0d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 4
+
+    mov         T0d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 5
+
+    mov         T0d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 6
+
+    mov         T0d, INT [LUT +  7*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T0 * 2], 7
+%endmacro
+
+%macro LOAD7 0
+    pxor        N0, N0
+    pxor        X0, X0
+
+    mov         T1d, INT [LUT +  0*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 0
+
+    cmp         LENEND, 2
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  1*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 1
+
+    cmp         LENEND, 3
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  2*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 2
+
+    cmp         LENEND, 4
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  3*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 3
+
+    cmp         LENEND, 5
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  4*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 4
+
+    cmp         LENEND, 6
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  5*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 5
+
+    cmp         LENEND, 7
+    jl          %%.ELOAD7
+    mov         T1d, INT [LUT +  6*SIZEOF_INT]
+    pinsrw      X0, word [BLOCK + T1 * 2], 6
+%%.ELOAD7:
+%endmacro
+
+%macro REDUCE0 0
+    movdqa      xmm0, XMMWORD [VALUES + ( 0*2)]
+    movdqa      xmm1, XMMWORD [VALUES + ( 8*2)]
+    movdqa      xmm2, XMMWORD [VALUES + (16*2)]
+    movdqa      xmm3, XMMWORD [VALUES + (24*2)]
+    movdqa      xmm4, XMMWORD [VALUES + (32*2)]
+    movdqa      xmm5, XMMWORD [VALUES + (40*2)]
+    movdqa      xmm6, XMMWORD [VALUES + (48*2)]
+    movdqa      xmm7, XMMWORD [VALUES + (56*2)]
+
+    pcmpeqw     xmm0, ZERO
+    pcmpeqw     xmm1, ZERO
+    pcmpeqw     xmm2, ZERO
+    pcmpeqw     xmm3, ZERO
+    pcmpeqw     xmm4, ZERO
+    pcmpeqw     xmm5, ZERO
+    pcmpeqw     xmm6, ZERO
+    pcmpeqw     xmm7, ZERO
+
+    packsswb    xmm0, xmm1
+    packsswb    xmm2, xmm3
+    packsswb    xmm4, xmm5
+    packsswb    xmm6, xmm7
+
+    pmovmskb    eax, xmm0
+    pmovmskb    ecx, xmm2
+    pmovmskb    edx, xmm4
+    pmovmskb    esi, xmm6
+
+    shl         rcx, 16
+    shl         rdx, 32
+    shl         rsi, 48
+
+    or          rax, rcx
+    or          rdx, rsi
+    or          rax, rdx
+
+    not         rax
+
+    mov         MMWORD [r15], rax
+%endmacro
+
+;
+; Prepare data for jsimd_encode_mcu_AC_first().
+;
+; GLOBAL(void)
+; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
+;                                        const int *jpeg_natural_order_start,
+;                                        int Sl, int Al, JCOEF *values,
+;                                        size_t *zerobits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *zerobits
+
+%define ZERO    xmm9
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define LUT     r11
+%define T0      rcx
+%define T0d     ecx
+%define T1      rdx
+%define T1d     edx
+%define BLOCK   r10
+%define VALUES  r14
+%define LEN     r12d
+%define LENEND  r13d
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [rbp - 16]
+    collect_args 6
+
+    movdqa      XMMWORD [rbp - 16], ZERO
+
+    movd        AL, r13d
+    pxor        ZERO, ZERO
+    mov         K, LEN
+    mov         LENEND, LEN
+    and         K, -16
+    and         LENEND, 7
+    shr         K, 4
+    jz          .ELOOP16
+.BLOOP16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    dec         K
+    jnz         .BLOOP16
+    test        LEN, 15
+    je          .PADDING
+.ELOOP16:
+    test        LEN, 8
+    jz          .TRY7
+    test        LEN, 7
+    jz          .TRY8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    pxor        N0, X0
+    pxor        N1, X1
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    movdqa      XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
+    add         VALUES, 16*2
+    jmp         .PADDING
+.TRY8:
+    LOAD8
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+    jmp         .PADDING
+.TRY7:
+    LOAD7
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    pxor        N0, X0
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
+    add         VALUES, 8*2
+.PADDING:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDING
+    align       16
+.ZEROLOOP:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOP
+.EPADDING:
+    sub         VALUES, DCTSIZE2*2
+
+    REDUCE0
+
+    movdqa      ZERO, XMMWORD [rbp - 16]
+    uncollect_args 6
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+%undef ZERO
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+;
+; Prepare data for jsimd_encode_mcu_AC_refine().
+;
+; GLOBAL(int)
+; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
+;                                         const int *jpeg_natural_order_start,
+;                                         int Sl, int Al, JCOEF *absvalues,
+;                                         size_t *bits)
+;
+; r10 = const JCOEF *block
+; r11 = const int *jpeg_natural_order_start
+; r12 = int Sl
+; r13 = int Al
+; r14 = JCOEF *values
+; r15 = size_t *bits
+
+%define ZERO    xmm9
+%define ONE     xmm5
+%define X0      xmm0
+%define X1      xmm1
+%define N0      xmm2
+%define N1      xmm3
+%define AL      xmm4
+%define K       eax
+%define KK      r9d
+%define EOB     r8d
+%define SIGN    rdi
+%define LUT     r11
+%define T0      rcx
+%define T0d     ecx
+%define T1      rdx
+%define T1d     edx
+%define BLOCK   r10
+%define VALUES  r14
+%define LEN     r12d
+%define LENEND  r13d
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
+
+EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [rbp - 16]
+    collect_args 6
+
+    movdqa      XMMWORD [rbp - 16], ZERO
+
+    xor         SIGN, SIGN
+    xor         EOB, EOB
+    xor         KK, KK
+    movd        AL, r13d
+    pxor        ZERO, ZERO
+    pcmpeqw     ONE, ONE
+    psrlw       ONE, 15
+    mov         K, LEN
+    mov         LENEND, LEN
+    and         K, -16
+    and         LENEND, 7
+    shr         K, 4
+    jz          .ELOOPR16
+.BLOOPR16:
+    LOAD16
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 16                ; make room for sizebits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER16            ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER16:
+    add         VALUES, 16*2
+    add         LUT, 16*SIZEOF_INT
+    add         KK, 16
+    dec         K
+    jnz         .BLOOPR16
+    test        LEN, 15
+    je          .PADDINGR
+.ELOOPR16:
+    test        LEN, 8
+    jz          .TRYR7
+    test        LEN, 7
+    jz          .TRYR8
+
+    LOAD15
+    pcmpgtw     N0, X0
+    pcmpgtw     N1, X1
+    paddw       X0, N0
+    paddw       X1, N1
+    pxor        X0, N0
+    pxor        X1, N1
+    psrlw       X0, AL
+    psrlw       X1, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    movdqa      XMMWORD [VALUES + (8) * 2], X1
+    pcmpeqw     X0, ONE
+    pcmpeqw     X1, ONE
+    packsswb    N0, N1
+    packsswb    X0, X1
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 16                ; make room for sizebits
+    shl         T0, 48
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER15            ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER15:
+    add         VALUES, 16*2
+    jmp         .PADDINGR
+.TRYR8:
+    LOAD8
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 8                 ; make room for sizebits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                ;  idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER8             ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER8:
+    add         VALUES, 8*2
+    jmp         .PADDINGR
+.TRYR7:
+    LOAD7
+
+    pcmpgtw     N0, X0
+    paddw       X0, N0
+    pxor        X0, N0
+    psrlw       X0, AL
+    movdqa      XMMWORD [VALUES + (0) * 2], X0
+    pcmpeqw     X0, ONE
+    packsswb    N0, ZERO
+    packsswb    X0, ZERO
+    pmovmskb    T0d, N0                 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
+    pmovmskb    T1d, X0                 ; idx = _mm_movemask_epi8(x1);
+    shr         SIGN, 8                 ; make room for sizebits
+    shl         T0, 56
+    or          SIGN, T0
+    bsr         T1d, T1d                ; idx = 16 - (__builtin_clz(idx)>>1);
+    jz          .CONTINUER7             ; if (idx) {
+    mov         EOB, KK
+    add         EOB, T1d                ; EOB = k + idx;
+.CONTINUER7:
+    add         VALUES, 8*2
+.PADDINGR:
+    mov         K, LEN
+    add         K, 7
+    and         K, -8
+    shr         K, 3
+    sub         K, DCTSIZE2/8
+    jz          .EPADDINGR
+    align       16
+.ZEROLOOPR:
+    movdqa      XMMWORD [VALUES + 0], ZERO
+    shr         SIGN, 8
+    add         VALUES, 8*2
+    inc         K
+    jnz         .ZEROLOOPR
+.EPADDINGR:
+    not         SIGN
+    sub         VALUES, DCTSIZE2*2
+    mov         MMWORD [r15+SIZEOF_MMWORD], SIGN
+
+    REDUCE0
+
+    mov         eax, EOB
+    movdqa      ZERO, XMMWORD [rbp - 16]
+    uncollect_args 6
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+%undef ZERO
+%undef ONE
+%undef X0
+%undef X1
+%undef N0
+%undef N1
+%undef AL
+%undef K
+%undef KK
+%undef EOB
+%undef SIGN
+%undef LUT
+%undef T0
+%undef T0d
+%undef T1
+%undef T1d
+%undef BLOCK
+%undef VALUES
+%undef LEN
+%undef LENEND
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcsample-avx2.asm b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
new file mode 100644
index 0000000000..b32527aebe
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-avx2.asm
@@ -0,0 +1,367 @@
+;
+; jcsample.asm - downsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
+
+EXTN(jsimd_h2v1_downsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    vmovd       xmm7, edx
+    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+
+.columnloop_r24:
+    ; rcx can possibly be 8, 16, 24
+    cmp         rcx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm1, ymm1, ymm1
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpsrlw      ymm2, ymm0, BYTE_BIT
+    vpand       ymm0, ymm0, ymm6
+    vpsrlw      ymm3, ymm1, BYTE_BIT
+    vpand       ymm1, ymm1, ymm6
+
+    vpaddw      ymm0, ymm0, ymm2
+    vpaddw      ymm1, ymm1, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpsrlw      ymm0, ymm0, 1
+    vpsrlw      ymm1, ymm1, 1
+
+    vpackuswb   ymm0, ymm0, ymm1
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
+
+EXTN(jsimd_h2v2_downsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    vmovd       xmm7, edx
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpshufd     xmm7, xmm7, 0x00        ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
+    vperm2i128  ymm7, ymm7, ymm7, 0
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         short .columnloop
+
+.columnloop_r24:
+    cmp         rcx, 24
+    jne         .columnloop_r16
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r16:
+    cmp         rcx, 16
+    jne         .columnloop_r8
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop_r8:
+    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    vpxor       ymm2, ymm2, ymm2
+    vpxor       ymm3, ymm3, ymm3
+    mov         rcx, SIZEOF_YMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+
+.downsample:
+    vpand       ymm4, ymm0, ymm6
+    vpsrlw      ymm0, ymm0, BYTE_BIT
+    vpand       ymm5, ymm1, ymm6
+    vpsrlw      ymm1, ymm1, BYTE_BIT
+    vpaddw      ymm0, ymm0, ymm4
+    vpaddw      ymm1, ymm1, ymm5
+
+    vpand       ymm4, ymm2, ymm6
+    vpsrlw      ymm2, ymm2, BYTE_BIT
+    vpand       ymm5, ymm3, ymm6
+    vpsrlw      ymm3, ymm3, BYTE_BIT
+    vpaddw      ymm2, ymm2, ymm4
+    vpaddw      ymm3, ymm3, ymm5
+
+    vpaddw      ymm0, ymm0, ymm1
+    vpaddw      ymm2, ymm2, ymm3
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpsrlw      ymm0, ymm0, 2
+    vpsrlw      ymm2, ymm2, 2
+
+    vpackuswb   ymm0, ymm0, ymm2
+    vpermq      ymm0, ymm0, 0xd8
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+
+    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r24
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jcsample-sse2.asm b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
new file mode 100644
index 0000000000..2fcfe4567a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jcsample-sse2.asm
@@ -0,0 +1,330 @@
+;
+; jcsample.asm - downsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Downsample pixel values of a single component.
+; This version handles the common case of 2:1 horizontal and 1:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
+
+EXTN(jsimd_h2v1_downsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v1_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        eax, eax
+    jle         near .return
+
+    mov         rdx, 0x00010000         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    pxor        xmm1, xmm1
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm2, xmm0
+    movdqa      xmm3, xmm1
+
+    pand        xmm0, xmm6
+    psrlw       xmm2, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm3, BYTE_BIT
+
+    paddw       xmm0, xmm2
+    paddw       xmm1, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    psrlw       xmm0, 1
+    psrlw       xmm1, 1
+
+    packuswb    xmm0, xmm1
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+    test        rcx, rcx
+    jnz         short .columnloop_r8
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rax                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Downsample pixel values of a single component.
+; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
+; without smoothing.
+;
+; GLOBAL(void)
+; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
+;                            JDIMENSION v_samp_factor,
+;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
+;                            JSAMPARRAY output_data);
+;
+
+; r10d = JDIMENSION image_width
+; r11 = int max_v_samp_factor
+; r12d = JDIMENSION v_samp_factor
+; r13d = JDIMENSION width_in_blocks
+; r14 = JSAMPARRAY input_data
+; r15 = JSAMPARRAY output_data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
+
+EXTN(jsimd_h2v2_downsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 6
+
+    mov         ecx, r13d
+    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
+    jz          near .return
+
+    mov         edx, r10d
+
+    ; -- expand_right_edge
+
+    push        rcx
+    shl         rcx, 1                  ; output_cols * 2
+    sub         rcx, rdx
+    jle         short .expand_end
+
+    mov         rax, r11
+    test        rax, rax
+    jle         short .expand_end
+
+    cld
+    mov         rsi, r14                ; input_data
+.expandloop:
+    push        rax
+    push        rcx
+
+    mov         rdip, JSAMPROW [rsi]
+    add         rdi, rdx
+    mov         al, JSAMPLE [rdi-1]
+
+    rep stosb
+
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    dec         rax
+    jg          short .expandloop
+
+.expand_end:
+    pop         rcx                     ; output_cols
+
+    ; -- h2v2_downsample
+
+    mov         eax, r12d               ; rowctr
+    test        rax, rax
+    jle         near .return
+
+    mov         rdx, 0x00020001         ; bias pattern
+    movd        xmm7, edx
+    pcmpeqw     xmm6, xmm6
+    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+
+    mov         rsi, r14                ; input_data
+    mov         rdi, r15                ; output_data
+.rowloop:
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         short .columnloop
+
+.columnloop_r8:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    pxor        xmm2, xmm2
+    pxor        xmm3, xmm3
+    mov         rcx, SIZEOF_XMMWORD
+    jmp         short .downsample
+
+.columnloop:
+    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+.downsample:
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    pand        xmm0, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm1, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm0, xmm4
+    paddw       xmm1, xmm5
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    pand        xmm2, xmm6
+    psrlw       xmm4, BYTE_BIT
+    pand        xmm3, xmm6
+    psrlw       xmm5, BYTE_BIT
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+
+    paddw       xmm0, xmm1
+    paddw       xmm2, xmm3
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm7
+    psrlw       xmm0, 2
+    psrlw       xmm2, 2
+
+    packuswb    xmm0, xmm2
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+
+    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
+    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jae         near .columnloop
+    test        rcx, rcx
+    jnz         near .columnloop_r8
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
+    dec         rax                          ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 6
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-avx2.asm b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
new file mode 100644
index 0000000000..2370fda642
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-avx2.asm
@@ -0,0 +1,496 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_avx2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_avx2)
+
+EXTN(jsimd_ycc_rgb_convert_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+.columnloop:
+
+    vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm1, YMMWORD [rdx]     ; ymm1=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm0, ymm0, ymm0
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsrlw      ymm0, ymm0, BYTE_BIT    ; ymm0={0xFF 0x00 0xFF 0x00 ..}
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpand       ymm4, ymm0, ymm5        ; ymm4=Cb(02468ACEGIKMOQSU)=CbE
+    vpsrlw      ymm5, ymm5, BYTE_BIT    ; ymm5=Cb(13579BDFHJLNPRTV)=CbO
+    vpand       ymm0, ymm0, ymm1        ; ymm0=Cr(02468ACEGIKMOQSU)=CrE
+    vpsrlw      ymm1, ymm1, BYTE_BIT    ; ymm1=Cr(13579BDFHJLNPRTV)=CrO
+
+    vpaddw      ymm2, ymm4, ymm7
+    vpaddw      ymm3, ymm5, ymm7
+    vpaddw      ymm6, ymm0, ymm7
+    vpaddw      ymm7, ymm1, ymm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbE
+    vpaddw      ymm5, ymm3, ymm3             ; ymm5=2*CbO
+    vpaddw      ymm0, ymm6, ymm6             ; ymm0=2*CrE
+    vpaddw      ymm1, ymm7, ymm7             ; ymm1=2*CrO
+
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbE * -FIX(0.22800))
+    vpmulhw     ymm5, ymm5, [rel PW_MF0228]  ; ymm5=(2*CbO * -FIX(0.22800))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrE * FIX(0.40200))
+    vpmulhw     ymm1, ymm1, [rel PW_F0402]   ; ymm1=(2*CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbE * -FIX(0.22800))
+    vpsraw      ymm5, ymm5, 1                ; ymm5=(CbO * -FIX(0.22800))
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpaddw      ymm1, ymm1, [rel PW_ONE]
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrE * FIX(0.40200))
+    vpsraw      ymm1, ymm1, 1                ; ymm1=(CrO * FIX(0.40200))
+
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm5, ymm5, ymm3
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbE * FIX(1.77200))=(B-Y)E
+    vpaddw      ymm5, ymm5, ymm3             ; ymm5=(CbO * FIX(1.77200))=(B-Y)O
+    vpaddw      ymm0, ymm0, ymm6             ; ymm0=(CrE * FIX(1.40200))=(R-Y)E
+    vpaddw      ymm1, ymm1, ymm7             ; ymm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    vmovdqa     YMMWORD [wk(0)], ymm4        ; wk(0)=(B-Y)E
+    vmovdqa     YMMWORD [wk(1)], ymm5        ; wk(1)=(B-Y)O
+
+    vpunpckhwd  ymm4, ymm2, ymm6
+    vpunpcklwd  ymm2, ymm2, ymm6
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm4, ymm4, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm5, ymm3, ymm7
+    vpunpcklwd  ymm3, ymm3, ymm7
+    vpmaddwd    ymm3, ymm3, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]
+    vpaddd      ymm4, ymm4, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm4, ymm4, SCALEBITS
+    vpaddd      ymm3, ymm3, [rel PD_ONEHALF]
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpsrad      ymm3, ymm3, SCALEBITS
+    vpsrad      ymm5, ymm5, SCALEBITS
+
+    vpackssdw   ymm2, ymm2, ymm4             ; ymm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    vpackssdw   ymm3, ymm3, ymm5             ; ymm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    vpsubw      ymm2, ymm2, ymm6             ; ymm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    vpsubw      ymm3, ymm3, ymm7             ; ymm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    vmovdqu     ymm5, YMMWORD [rsi]          ; ymm5=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm4, ymm4, ymm4
+    vpsrlw      ymm4, ymm4, BYTE_BIT         ; ymm4={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm4, ymm4, ymm5             ; ymm4=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm5, ymm5, BYTE_BIT         ; ymm5=Y(13579BDFHJLNPRTV)=YO
+
+    vpaddw      ymm0, ymm0, ymm4             ; ymm0=((R-Y)E+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm5             ; ymm1=((R-Y)O+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0             ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1             ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm4             ; ymm2=((G-Y)E+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm5             ; ymm3=((G-Y)O+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2             ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3             ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, YMMWORD [wk(0)]  ; ymm4=(YE+(B-Y)E)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, YMMWORD [wk(1)]  ; ymm5=(YO+(B-Y)O)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4             ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5             ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdcolext-sse2.asm b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
new file mode 100644
index 0000000000..e07c8d7518
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolext-sse2.asm
@@ -0,0 +1,439 @@
+;
+; jdcolext.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Convert some rows of samples to the output colorspace.
+;
+; GLOBAL(void)
+; jsimd_ycc_rgb_convert_sse2(JDIMENSION out_width, JSAMPIMAGE input_buf,
+;                            JDIMENSION input_row, JSAMPARRAY output_buf,
+;                            int num_rows)
+;
+
+; r10d = JDIMENSION out_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION input_row
+; r13 = JSAMPARRAY output_buf
+; r14d = int num_rows
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_ycc_rgb_convert_sse2)
+
+EXTN(jsimd_ycc_rgb_convert_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 5
+    push        rbx
+
+    mov         ecx, r10d               ; num_cols
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
+    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
+
+    pop         rcx
+
+    mov         rdi, r13
+    mov         eax, r14d
+    test        rax, rax
+    jle         near .return
+.rowloop:
+    push        rax
+    push        rdi
+    push        rdx
+    push        rbx
+    push        rsi
+    push        rcx                     ; col
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+.columnloop:
+
+    movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
+    movdqa      xmm1, XMMWORD [rdx]     ; xmm1=Cr(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    pcmpeqw     xmm7, xmm7
+    psrlw       xmm4, BYTE_BIT
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+    movdqa      xmm0, xmm4              ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..}
+
+    pand        xmm4, xmm5              ; xmm4=Cb(02468ACE)=CbE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Cb(13579BDF)=CbO
+    pand        xmm0, xmm1              ; xmm0=Cr(02468ACE)=CrE
+    psrlw       xmm1, BYTE_BIT          ; xmm1=Cr(13579BDF)=CrO
+
+    paddw       xmm4, xmm7
+    paddw       xmm5, xmm7
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm2, xmm4              ; xmm2=CbE
+    movdqa      xmm3, xmm5              ; xmm3=CbO
+    paddw       xmm4, xmm4              ; xmm4=2*CbE
+    paddw       xmm5, xmm5              ; xmm5=2*CbO
+    movdqa      xmm6, xmm0              ; xmm6=CrE
+    movdqa      xmm7, xmm1              ; xmm7=CrO
+    paddw       xmm0, xmm0              ; xmm0=2*CrE
+    paddw       xmm1, xmm1              ; xmm1=2*CrO
+
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbE * -FIX(0.22800))
+    pmulhw      xmm5, [rel PW_MF0228]   ; xmm5=(2*CbO * -FIX(0.22800))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrE * FIX(0.40200))
+    pmulhw      xmm1, [rel PW_F0402]    ; xmm1=(2*CrO * FIX(0.40200))
+
+    paddw       xmm4, [rel PW_ONE]
+    paddw       xmm5, [rel PW_ONE]
+    psraw       xmm4, 1                 ; xmm4=(CbE * -FIX(0.22800))
+    psraw       xmm5, 1                 ; xmm5=(CbO * -FIX(0.22800))
+    paddw       xmm0, [rel PW_ONE]
+    paddw       xmm1, [rel PW_ONE]
+    psraw       xmm0, 1                 ; xmm0=(CrE * FIX(0.40200))
+    psraw       xmm1, 1                 ; xmm1=(CrO * FIX(0.40200))
+
+    paddw       xmm4, xmm2
+    paddw       xmm5, xmm3
+    paddw       xmm4, xmm2              ; xmm4=(CbE * FIX(1.77200))=(B-Y)E
+    paddw       xmm5, xmm3              ; xmm5=(CbO * FIX(1.77200))=(B-Y)O
+    paddw       xmm0, xmm6              ; xmm0=(CrE * FIX(1.40200))=(R-Y)E
+    paddw       xmm1, xmm7              ; xmm1=(CrO * FIX(1.40200))=(R-Y)O
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=(B-Y)E
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(B-Y)O
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm4, xmm6
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]
+    pmaddwd     xmm4, [rel PW_MF0344_F0285]
+    punpcklwd   xmm3, xmm7
+    punpckhwd   xmm5, xmm7
+    pmaddwd     xmm3, [rel PW_MF0344_F0285]
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]
+
+    paddd       xmm2, [rel PD_ONEHALF]
+    paddd       xmm4, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm4, SCALEBITS
+    paddd       xmm3, [rel PD_ONEHALF]
+    paddd       xmm5, [rel PD_ONEHALF]
+    psrad       xmm3, SCALEBITS
+    psrad       xmm5, SCALEBITS
+
+    packssdw    xmm2, xmm4              ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
+    packssdw    xmm3, xmm5              ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
+    psubw       xmm2, xmm6              ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
+    psubw       xmm3, xmm7              ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
+
+    movdqa      xmm5, XMMWORD [rsi]     ; xmm5=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm4, xmm4
+    psrlw       xmm4, BYTE_BIT          ; xmm4={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm4, xmm5              ; xmm4=Y(02468ACE)=YE
+    psrlw       xmm5, BYTE_BIT          ; xmm5=Y(13579BDF)=YO
+
+    paddw       xmm0, xmm4              ; xmm0=((R-Y)E+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm5              ; xmm1=((R-Y)O+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm4              ; xmm2=((G-Y)E+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm5              ; xmm3=((G-Y)O+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, XMMWORD [wk(0)]   ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE)
+    paddw       xmm5, XMMWORD [wk(1)]   ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .nextrow
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .nextrow
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.nextrow:
+    pop         rcx
+    pop         rsi
+    pop         rbx
+    pop         rdx
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW
+    add         rbx, byte SIZEOF_JSAMPROW
+    add         rdx, byte SIZEOF_JSAMPROW
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_buf
+    dec         rax                        ; num_rows
+    jg          near .rowloop
+
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    uncollect_args 5
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdcolor-avx2.asm b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
new file mode 100644
index 0000000000..43de9db04d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-avx2.asm
@@ -0,0 +1,118 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_avx2)
+
+EXTN(jconst_ycc_rgb_convert_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgb_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extrgbx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extbgrx_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxbgr_convert_avx2
+%include "jdcolext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_avx2  jsimd_ycc_extxrgb_convert_avx2
+%include "jdcolext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdcolor-sse2.asm b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
new file mode 100644
index 0000000000..b3f1fec07e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdcolor-sse2.asm
@@ -0,0 +1,117 @@
+;
+; jdcolor.asm - colorspace conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_ycc_rgb_convert_sse2)
+
+EXTN(jconst_ycc_rgb_convert_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgb_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extrgbx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extbgrx_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxbgr_convert_sse2
+%include "jdcolext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_ycc_rgb_convert_sse2  jsimd_ycc_extxrgb_convert_sse2
+%include "jdcolext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-avx2.asm b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
new file mode 100644
index 0000000000..9515a17013
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-avx2.asm
@@ -0,0 +1,136 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_avx2)
+
+EXTN(jconst_merged_upsample_avx2):
+
+PW_F0402        times 16 dw  F_0_402
+PW_MF0228       times 16 dw -F_0_228
+PW_MF0344_F0285 times 8  dw -F_0_344, F_0_285
+PW_ONE          times 16 dw  1
+PD_ONEHALF      times 8  dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extrgbx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extrgbx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extbgrx_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extbgrx_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxbgr_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxbgr_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_avx2 \
+  jsimd_h2v1_extxrgb_merged_upsample_avx2
+%define jsimd_h2v2_merged_upsample_avx2 \
+  jsimd_h2v2_extxrgb_merged_upsample_avx2
+%include "jdmrgext-avx2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmerge-sse2.asm b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
new file mode 100644
index 0000000000..aedccc20f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmerge-sse2.asm
@@ -0,0 +1,135 @@
+;
+; jdmerge.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+
+%define SCALEBITS  16
+
+F_0_344 equ  22554              ; FIX(0.34414)
+F_0_714 equ  46802              ; FIX(0.71414)
+F_1_402 equ  91881              ; FIX(1.40200)
+F_1_772 equ 116130              ; FIX(1.77200)
+F_0_402 equ (F_1_402 - 65536)   ; FIX(1.40200) - FIX(1)
+F_0_285 equ ( 65536 - F_0_714)  ; FIX(1) - FIX(0.71414)
+F_0_228 equ (131072 - F_1_772)  ; FIX(2) - FIX(1.77200)
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_merged_upsample_sse2)
+
+EXTN(jconst_merged_upsample_sse2):
+
+PW_F0402        times 8 dw  F_0_402
+PW_MF0228       times 8 dw -F_0_228
+PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285
+PW_ONE          times 8 dw  1
+PD_ONEHALF      times 4 dd  1 << (SCALEBITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGB_RED
+%define RGB_GREEN  EXT_RGB_GREEN
+%define RGB_BLUE  EXT_RGB_BLUE
+%define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_RGBX_RED
+%define RGB_GREEN  EXT_RGBX_GREEN
+%define RGB_BLUE  EXT_RGBX_BLUE
+%define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGR_RED
+%define RGB_GREEN  EXT_BGR_GREEN
+%define RGB_BLUE  EXT_BGR_BLUE
+%define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_BGRX_RED
+%define RGB_GREEN  EXT_BGRX_GREEN
+%define RGB_BLUE  EXT_BGRX_BLUE
+%define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XBGR_RED
+%define RGB_GREEN  EXT_XBGR_GREEN
+%define RGB_BLUE  EXT_XBGR_BLUE
+%define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED  EXT_XRGB_RED
+%define RGB_GREEN  EXT_XRGB_GREEN
+%define RGB_BLUE  EXT_XRGB_BLUE
+%define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+%define jsimd_h2v1_merged_upsample_sse2 \
+  jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 \
+  jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgext-sse2.asm"
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
new file mode 100644
index 0000000000..8b264b4f03
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-avx2.asm
@@ -0,0 +1,596 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit AVX2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_avx2)
+
+EXTN(jsimd_h2v1_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    vmovdqu     ymm6, YMMWORD [rbx]     ; ymm6=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+    vmovdqu     ymm7, YMMWORD [rdx]     ; ymm7=Cr(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
+    vpcmpeqw    ymm3, ymm3, ymm3
+    vpsllw      ymm3, ymm3, 7           ; ymm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpermq      ymm6, ymm6, 0xd8        ; ymm6=Cb(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpermq      ymm7, ymm7, 0xd8        ; ymm7=Cr(01234567GHIJKLMN89ABCDEFOPQRSTUV)
+    vpunpcklbw  ymm4, ymm6, ymm1        ; ymm4=Cb(0123456789ABCDEF)=CbL
+    vpunpckhbw  ymm6, ymm6, ymm1        ; ymm6=Cb(GHIJKLMNOPQRSTUV)=CbH
+    vpunpcklbw  ymm0, ymm7, ymm1        ; ymm0=Cr(0123456789ABCDEF)=CrL
+    vpunpckhbw  ymm7, ymm7, ymm1        ; ymm7=Cr(GHIJKLMNOPQRSTUV)=CrH
+
+    vpaddw      ymm5, ymm6, ymm3
+    vpaddw      ymm2, ymm4, ymm3
+    vpaddw      ymm1, ymm7, ymm3
+    vpaddw      ymm3, ymm0, ymm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    vpaddw      ymm6, ymm5, ymm5             ; ymm6=2*CbH
+    vpaddw      ymm4, ymm2, ymm2             ; ymm4=2*CbL
+    vpaddw      ymm7, ymm1, ymm1             ; ymm7=2*CrH
+    vpaddw      ymm0, ymm3, ymm3             ; ymm0=2*CrL
+
+    vpmulhw     ymm6, ymm6, [rel PW_MF0228]  ; ymm6=(2*CbH * -FIX(0.22800))
+    vpmulhw     ymm4, ymm4, [rel PW_MF0228]  ; ymm4=(2*CbL * -FIX(0.22800))
+    vpmulhw     ymm7, ymm7, [rel PW_F0402]   ; ymm7=(2*CrH * FIX(0.40200))
+    vpmulhw     ymm0, ymm0, [rel PW_F0402]   ; ymm0=(2*CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, [rel PW_ONE]
+    vpaddw      ymm4, ymm4, [rel PW_ONE]
+    vpsraw      ymm6, ymm6, 1                ; ymm6=(CbH * -FIX(0.22800))
+    vpsraw      ymm4, ymm4, 1                ; ymm4=(CbL * -FIX(0.22800))
+    vpaddw      ymm7, ymm7, [rel PW_ONE]
+    vpaddw      ymm0, ymm0, [rel PW_ONE]
+    vpsraw      ymm7, ymm7, 1                ; ymm7=(CrH * FIX(0.40200))
+    vpsraw      ymm0, ymm0, 1                ; ymm0=(CrL * FIX(0.40200))
+
+    vpaddw      ymm6, ymm6, ymm5
+    vpaddw      ymm4, ymm4, ymm2
+    vpaddw      ymm6, ymm6, ymm5             ; ymm6=(CbH * FIX(1.77200))=(B-Y)H
+    vpaddw      ymm4, ymm4, ymm2             ; ymm4=(CbL * FIX(1.77200))=(B-Y)L
+    vpaddw      ymm7, ymm7, ymm1             ; ymm7=(CrH * FIX(1.40200))=(R-Y)H
+    vpaddw      ymm0, ymm0, ymm3             ; ymm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    vmovdqa     YMMWORD [wk(0)], ymm6        ; wk(0)=(B-Y)H
+    vmovdqa     YMMWORD [wk(1)], ymm7        ; wk(1)=(R-Y)H
+
+    vpunpckhwd  ymm6, ymm5, ymm1
+    vpunpcklwd  ymm5, ymm5, ymm1
+    vpmaddwd    ymm5, ymm5, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm6, ymm6, [rel PW_MF0344_F0285]
+    vpunpckhwd  ymm7, ymm2, ymm3
+    vpunpcklwd  ymm2, ymm2, ymm3
+    vpmaddwd    ymm2, ymm2, [rel PW_MF0344_F0285]
+    vpmaddwd    ymm7, ymm7, [rel PW_MF0344_F0285]
+
+    vpaddd      ymm5, ymm5, [rel PD_ONEHALF]
+    vpaddd      ymm6, ymm6, [rel PD_ONEHALF]
+    vpsrad      ymm5, ymm5, SCALEBITS
+    vpsrad      ymm6, ymm6, SCALEBITS
+    vpaddd      ymm2, ymm2, [rel PD_ONEHALF]
+    vpaddd      ymm7, ymm7, [rel PD_ONEHALF]
+    vpsrad      ymm2, ymm2, SCALEBITS
+    vpsrad      ymm7, ymm7, SCALEBITS
+
+    vpackssdw   ymm5, ymm5, ymm6        ; ymm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    vpackssdw   ymm2, ymm2, ymm7        ; ymm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    vpsubw      ymm5, ymm5, ymm1        ; ymm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    vpsubw      ymm2, ymm2, ymm3        ; ymm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    vmovdqa     YMMWORD [wk(2)], ymm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+
+.Yloop_2nd:
+    vmovdqa     ymm0, YMMWORD [wk(1)]   ; ymm0=(R-Y)H
+    vmovdqa     ymm2, YMMWORD [wk(2)]   ; ymm2=(G-Y)H
+    vmovdqa     ymm4, YMMWORD [wk(0)]   ; ymm4=(B-Y)H
+
+.Yloop_1st:
+    vmovdqu     ymm7, YMMWORD [rsi]     ; ymm7=Y(0123456789ABCDEFGHIJKLMNOPQRSTUV)
+
+    vpcmpeqw    ymm6, ymm6, ymm6
+    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
+    vpand       ymm6, ymm6, ymm7        ; ymm6=Y(02468ACEGIKMOQSU)=YE
+    vpsrlw      ymm7, ymm7, BYTE_BIT    ; ymm7=Y(13579BDFHJLNPRTV)=YO
+
+    vmovdqa     ymm1, ymm0              ; ymm1=ymm0=(R-Y)(L/H)
+    vmovdqa     ymm3, ymm2              ; ymm3=ymm2=(G-Y)(L/H)
+    vmovdqa     ymm5, ymm4              ; ymm5=ymm4=(B-Y)(L/H)
+
+    vpaddw      ymm0, ymm0, ymm6        ; ymm0=((R-Y)+YE)=RE=R(02468ACEGIKMOQSU)
+    vpaddw      ymm1, ymm1, ymm7        ; ymm1=((R-Y)+YO)=RO=R(13579BDFHJLNPRTV)
+    vpackuswb   ymm0, ymm0, ymm0        ; ymm0=R(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm1, ymm1, ymm1        ; ymm1=R(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm2, ymm2, ymm6        ; ymm2=((G-Y)+YE)=GE=G(02468ACEGIKMOQSU)
+    vpaddw      ymm3, ymm3, ymm7        ; ymm3=((G-Y)+YO)=GO=G(13579BDFHJLNPRTV)
+    vpackuswb   ymm2, ymm2, ymm2        ; ymm2=G(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm3, ymm3, ymm3        ; ymm3=G(13579BDF********HJLNPRTV********)
+
+    vpaddw      ymm4, ymm4, ymm6        ; ymm4=((B-Y)+YE)=BE=B(02468ACEGIKMOQSU)
+    vpaddw      ymm5, ymm5, ymm7        ; ymm5=((B-Y)+YO)=BO=B(13579BDFHJLNPRTV)
+    vpackuswb   ymm4, ymm4, ymm4        ; ymm4=B(02468ACE********GIKMOQSU********)
+    vpackuswb   ymm5, ymm5, ymm5        ; ymm5=B(13579BDF********HJLNPRTV********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+    ; ymmH=(** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmB        ; ymmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F
+                                        ;       2G 0H 2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V)
+    vpunpcklbw  ymmD, ymmD, ymmF        ; ymmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F
+                                        ;       1H 2H 1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V)
+
+    vpsrldq     ymmH, ymmA, 2           ; ymmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E 0G 1G
+                                        ;       0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U -- --)
+    vpunpckhwd  ymmG, ymmA, ymmE        ; ymmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F
+                                        ;       0O 1O 2O 0P 0Q 1Q 2Q 0R 0S 1S 2S 0T 0U 1U 2U 0V)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07
+                                        ;       0G 1G 2G 0H 0I 1I 2I 0J 0K 1K 2K 0L 0M 1M 2M 0N)
+
+    vpsrldq     ymmE, ymmE, 2           ; ymmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F 2G 0H
+                                        ;       2I 0J 2K 0L 2M 0N 2O 0P 2Q 0R 2S 0T 2U 0V -- --)
+
+    vpsrldq     ymmB, ymmD, 2           ; ymmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F 1H 2H
+                                        ;       1J 2J 1L 2L 1N 2N 1P 2P 1R 2R 1T 2T 1V 2V -- --)
+    vpunpckhwd  ymmC, ymmD, ymmH        ; ymmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F 0G 1G
+                                        ;       1P 2P 0Q 1Q 1R 2R 0S 1S 1T 2T 0U 1U 1V 2V -- --)
+    vpunpcklwd  ymmD, ymmD, ymmH        ; ymmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18
+                                        ;       1H 2H 0I 1I 1J 2J 0K 1K 1L 2L 0M 1M 1N 2N 0O 1O)
+
+    vpunpckhwd  ymmF, ymmE, ymmB        ; ymmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F 2G 0H 1H 2H
+                                        ;       2Q 0R 1R 2R 2S 0T 1T 2T 2U 0V 1V 2V -- -- -- --)
+    vpunpcklwd  ymmE, ymmE, ymmB        ; ymmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29
+                                        ;       2I 0J 1J 2J 2K 0L 1L 2L 2M 0N 1N 2N 2O 0P 1P 2P)
+
+    vpshufd     ymmH, ymmA, 0x4E        ; ymmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03
+                                        ;       0K 1K 2K 0L 0M 1M 2M 0N 0G 1G 2G 0H 0I 1I 2I 0J)
+    vpunpckldq  ymmA, ymmA, ymmD        ; ymmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 0I 1I 2I 0J 1J 2J 0K 1K)
+    vpunpckhdq  ymmD, ymmD, ymmE        ; ymmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 1N 2N 0O 1O 2O 0P 1P 2P)
+    vpunpckldq  ymmE, ymmE, ymmH        ; ymmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07
+                                        ;       2I 0J 1J 2J 0K 1K 2K 0L 2K 0L 1L 2L 0M 1M 2M 0N)
+
+    vpshufd     ymmH, ymmG, 0x4E        ; ymmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B
+                                        ;       0S 1S 2S 0T 0U 1U 2U 0V 0O 1O 2O 0P 0Q 1Q 2Q 0R)
+    vpunpckldq  ymmG, ymmG, ymmC        ; ymmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C
+                                        ;       0O 1O 2O 0P 1P 2P 0Q 1Q 0Q 1Q 2Q 0R 1R 2R 0S 1S)
+    vpunpckhdq  ymmC, ymmC, ymmF        ; ymmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F 0G 1G 2G 0H 1H 2H
+                                        ;       1T 2T 0U 1U 2U 0V 1V 2V 1V 2V -- -- -- -- -- --)
+    vpunpckldq  ymmF, ymmF, ymmH        ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 2S 0T 1T 2T 0U 1U 2U 0V)
+
+    vpunpcklqdq ymmH, ymmA, ymmE        ; ymmH=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vpunpcklqdq ymmG, ymmD, ymmG        ; ymmG=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
+                                        ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
+    vpunpcklqdq ymmC, ymmF, ymmC        ; ymmC=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    vperm2i128  ymmA, ymmH, ymmG, 0x20  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
+                                        ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    vperm2i128  ymmD, ymmC, ymmH, 0x30  ; ymmD=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
+                                        ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
+    vperm2i128  ymmF, ymmG, ymmC, 0x31  ; ymmF=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
+                                        ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_YMMWORD
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmF
+    sub         rcx, byte 2*SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st31
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD
+    jmp         short .column_st31
+.column_st31:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    vmovq       XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    vmovd       XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    vpsrldq     xmmA, xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    vmovd       eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    vpcmpeqb    ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpcmpeqb    ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%else
+    vpxor       ymm6, ymm6, ymm6        ; ymm6=XE=X(02468ACE********GIKMOQSU********)
+    vpxor       ymm7, ymm7, ymm7        ; ymm7=XO=X(13579BDF********HJLNPRTV********)
+%endif
+    ; ymmA=(00 02 04 06 08 0A 0C 0E ** 0G 0I 0K 0M 0O 0Q 0S 0U **)
+    ; ymmB=(01 03 05 07 09 0B 0D 0F ** 0H 0J 0L 0N 0P 0R 0T 0V **)
+    ; ymmC=(10 12 14 16 18 1A 1C 1E ** 1G 1I 1K 1M 1O 1Q 1S 1U **)
+    ; ymmD=(11 13 15 17 19 1B 1D 1F ** 1H 1J 1L 1N 1P 1R 1T 1V **)
+    ; ymmE=(20 22 24 26 28 2A 2C 2E ** 2G 2I 2K 2M 2O 2Q 2S 2U **)
+    ; ymmF=(21 23 25 27 29 2B 2D 2F ** 2H 2J 2L 2N 2P 2R 2T 2V **)
+    ; ymmG=(30 32 34 36 38 3A 3C 3E ** 3G 3I 3K 3M 3O 3Q 3S 3U **)
+    ; ymmH=(31 33 35 37 39 3B 3D 3F ** 3H 3J 3L 3N 3P 3R 3T 3V **)
+
+    vpunpcklbw  ymmA, ymmA, ymmC        ; ymmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
+                                        ;       0G 1G 0I 1I 0K 1K 0M 1M 0O 1O 0Q 1Q 0S 1S 0U 1U)
+    vpunpcklbw  ymmE, ymmE, ymmG        ; ymmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
+                                        ;       2G 3G 2I 3I 2K 3K 2M 3M 2O 3O 2Q 3Q 2S 3S 2U 3U)
+    vpunpcklbw  ymmB, ymmB, ymmD        ; ymmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
+                                        ;       0H 1H 0J 1J 0L 1L 0N 1N 0P 1P 0R 1R 0T 1T 0V 1V)
+    vpunpcklbw  ymmF, ymmF, ymmH        ; ymmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
+                                        ;       2H 3H 2J 3J 2L 3L 2N 3N 2P 3P 2R 3R 2T 3T 2V 3V)
+
+    vpunpckhwd  ymmC, ymmA, ymmE        ; ymmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E
+                                        ;       0O 1O 2O 3O 0Q 1Q 2Q 3Q 0S 1S 2S 3S 0U 1U 2U 3U)
+    vpunpcklwd  ymmA, ymmA, ymmE        ; ymmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36
+                                        ;       0G 1G 2G 3G 0I 1I 2I 3I 0K 1K 2K 3K 0M 1M 2M 3M)
+    vpunpckhwd  ymmG, ymmB, ymmF        ; ymmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F
+                                        ;       0P 1P 2P 3P 0R 1R 2R 3R 0T 1T 2T 3T 0V 1V 2V 3V)
+    vpunpcklwd  ymmB, ymmB, ymmF        ; ymmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37
+                                        ;       0H 1H 2H 3H 0J 1J 2J 3J 0L 1L 2L 3L 0N 1N 2N 3N)
+
+    vpunpckhdq  ymmE, ymmA, ymmB        ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vpunpckldq  ymmB, ymmA, ymmB        ; ymmB=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
+    vpunpckhdq  ymmF, ymmC, ymmG        ; ymmF=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+    vpunpckldq  ymmG, ymmC, ymmG        ; ymmG=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
+
+    vperm2i128  ymmA, ymmB, ymmE, 0x20  ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+                                        ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    vperm2i128  ymmD, ymmG, ymmF, 0x20  ; ymmD=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
+                                        ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+    vperm2i128  ymmC, ymmB, ymmE, 0x31  ; ymmC=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
+                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
+    vperm2i128  ymmH, ymmG, ymmF, 0x31  ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
+                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
+
+    cmp         rcx, byte SIZEOF_YMMWORD
+    jb          short .column_st64
+
+    test        rdi, SIZEOF_YMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    vmovntdq    YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovntdq    YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovntdq    YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovntdq    YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymmC
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymmH
+.out0:
+    add         rdi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_YMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr0
+    dec         al
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_YMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_YMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st64:
+    cmp         rcx, byte SIZEOF_YMMWORD/2
+    jb          short .column_st32
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymmD
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    vmovdqa     ymmA, ymmC
+    vmovdqa     ymmD, ymmH
+    sub         rcx, byte SIZEOF_YMMWORD/2
+.column_st32:
+    cmp         rcx, byte SIZEOF_YMMWORD/4
+    jb          short .column_st16
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymmA
+    add         rdi, byte SIZEOF_YMMWORD    ; outptr
+    vmovdqa     ymmA, ymmD
+    sub         rcx, byte SIZEOF_YMMWORD/4
+.column_st16:
+    cmp         rcx, byte SIZEOF_YMMWORD/8
+    jb          short .column_st15
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    vperm2i128  ymmA, ymmA, ymmA, 1
+    sub         rcx, byte SIZEOF_YMMWORD/8
+.column_st15:
+    ; Store two pixels (8 bytes) of ymmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_YMMWORD/16
+    jb          short .column_st7
+    vmovq       MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_YMMWORD/16*4
+    sub         rcx, byte SIZEOF_YMMWORD/16
+    vpsrldq     xmmA, SIZEOF_YMMWORD/16*4
+.column_st7:
+    ; Store one pixel (4 bytes) of ymmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    vmovd       XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_avx2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_avx2)
+
+EXTN(jsimd_h2v2_merged_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_avx2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
new file mode 100644
index 0000000000..eb3ab9dbd9
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdmrgext-sse2.asm
@@ -0,0 +1,538 @@
+;
+; jdmrgext.asm - merged upsampling/color conversion (64-bit SSE2)
+;
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  3
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         ecx, r10d               ; col
+    test        rcx, rcx
+    jz          near .return
+
+    push        rcx
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
+
+    pop         rcx                     ; col
+
+.columnloop:
+
+    movdqa      xmm6, XMMWORD [rbx]     ; xmm6=Cb(0123456789ABCDEF)
+    movdqa      xmm7, XMMWORD [rdx]     ; xmm7=Cr(0123456789ABCDEF)
+
+    pxor        xmm1, xmm1              ; xmm1=(all 0's)
+    pcmpeqw     xmm3, xmm3
+    psllw       xmm3, 7                 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    movdqa      xmm4, xmm6
+    punpckhbw   xmm6, xmm1              ; xmm6=Cb(89ABCDEF)=CbH
+    punpcklbw   xmm4, xmm1              ; xmm4=Cb(01234567)=CbL
+    movdqa      xmm0, xmm7
+    punpckhbw   xmm7, xmm1              ; xmm7=Cr(89ABCDEF)=CrH
+    punpcklbw   xmm0, xmm1              ; xmm0=Cr(01234567)=CrL
+
+    paddw       xmm6, xmm3
+    paddw       xmm4, xmm3
+    paddw       xmm7, xmm3
+    paddw       xmm0, xmm3
+
+    ; (Original)
+    ; R = Y                + 1.40200 * Cr
+    ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+    ; B = Y + 1.77200 * Cb
+    ;
+    ; (This implementation)
+    ; R = Y                + 0.40200 * Cr + Cr
+    ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+    ; B = Y - 0.22800 * Cb + Cb + Cb
+
+    movdqa      xmm5, xmm6              ; xmm5=CbH
+    movdqa      xmm2, xmm4              ; xmm2=CbL
+    paddw       xmm6, xmm6              ; xmm6=2*CbH
+    paddw       xmm4, xmm4              ; xmm4=2*CbL
+    movdqa      xmm1, xmm7              ; xmm1=CrH
+    movdqa      xmm3, xmm0              ; xmm3=CrL
+    paddw       xmm7, xmm7              ; xmm7=2*CrH
+    paddw       xmm0, xmm0              ; xmm0=2*CrL
+
+    pmulhw      xmm6, [rel PW_MF0228]   ; xmm6=(2*CbH * -FIX(0.22800))
+    pmulhw      xmm4, [rel PW_MF0228]   ; xmm4=(2*CbL * -FIX(0.22800))
+    pmulhw      xmm7, [rel PW_F0402]    ; xmm7=(2*CrH * FIX(0.40200))
+    pmulhw      xmm0, [rel PW_F0402]    ; xmm0=(2*CrL * FIX(0.40200))
+
+    paddw       xmm6, [rel PW_ONE]
+    paddw       xmm4, [rel PW_ONE]
+    psraw       xmm6, 1                 ; xmm6=(CbH * -FIX(0.22800))
+    psraw       xmm4, 1                 ; xmm4=(CbL * -FIX(0.22800))
+    paddw       xmm7, [rel PW_ONE]
+    paddw       xmm0, [rel PW_ONE]
+    psraw       xmm7, 1                 ; xmm7=(CrH * FIX(0.40200))
+    psraw       xmm0, 1                 ; xmm0=(CrL * FIX(0.40200))
+
+    paddw       xmm6, xmm5
+    paddw       xmm4, xmm2
+    paddw       xmm6, xmm5              ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+    paddw       xmm4, xmm2              ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+    paddw       xmm7, xmm1              ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+    paddw       xmm0, xmm3              ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm5, xmm1
+    punpckhwd   xmm6, xmm1
+    pmaddwd     xmm5, [rel PW_MF0344_F0285]
+    pmaddwd     xmm6, [rel PW_MF0344_F0285]
+    punpcklwd   xmm2, xmm3
+    punpckhwd   xmm7, xmm3
+    pmaddwd     xmm2, [rel PW_MF0344_F0285]
+    pmaddwd     xmm7, [rel PW_MF0344_F0285]
+
+    paddd       xmm5, [rel PD_ONEHALF]
+    paddd       xmm6, [rel PD_ONEHALF]
+    psrad       xmm5, SCALEBITS
+    psrad       xmm6, SCALEBITS
+    paddd       xmm2, [rel PD_ONEHALF]
+    paddd       xmm7, [rel PD_ONEHALF]
+    psrad       xmm2, SCALEBITS
+    psrad       xmm7, SCALEBITS
+
+    packssdw    xmm5, xmm6              ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+    packssdw    xmm2, xmm7              ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+    psubw       xmm5, xmm1              ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+    psubw       xmm2, xmm3              ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+    movdqa      XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+    mov         al, 2                   ; Yctr
+    jmp         short .Yloop_1st
+
+.Yloop_2nd:
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+
+.Yloop_1st:
+    movdqa      xmm7, XMMWORD [rsi]     ; xmm7=Y(0123456789ABCDEF)
+
+    pcmpeqw     xmm6, xmm6
+    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+    pand        xmm6, xmm7              ; xmm6=Y(02468ACE)=YE
+    psrlw       xmm7, BYTE_BIT          ; xmm7=Y(13579BDF)=YO
+
+    movdqa      xmm1, xmm0              ; xmm1=xmm0=(R-Y)(L/H)
+    movdqa      xmm3, xmm2              ; xmm3=xmm2=(G-Y)(L/H)
+    movdqa      xmm5, xmm4              ; xmm5=xmm4=(B-Y)(L/H)
+
+    paddw       xmm0, xmm6              ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+    paddw       xmm1, xmm7              ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+    packuswb    xmm0, xmm0              ; xmm0=R(02468ACE********)
+    packuswb    xmm1, xmm1              ; xmm1=R(13579BDF********)
+
+    paddw       xmm2, xmm6              ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+    paddw       xmm3, xmm7              ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+    packuswb    xmm2, xmm2              ; xmm2=G(02468ACE********)
+    packuswb    xmm3, xmm3              ; xmm3=G(13579BDF********)
+
+    paddw       xmm4, xmm6              ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+    paddw       xmm5, xmm7              ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+    packuswb    xmm4, xmm4              ; xmm4=B(02468ACE********)
+    packuswb    xmm5, xmm5              ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3  ; ---------------
+
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+    punpcklbw   xmmA, xmmC        ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmB        ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+    punpcklbw   xmmD, xmmF        ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+    movdqa      xmmG, xmmA
+    movdqa      xmmH, xmmA
+    punpcklwd   xmmA, xmmE        ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+    punpckhwd   xmmG, xmmE        ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+    psrldq      xmmH, 2           ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+    psrldq      xmmE, 2           ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+    movdqa      xmmC, xmmD
+    movdqa      xmmB, xmmD
+    punpcklwd   xmmD, xmmH        ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+    punpckhwd   xmmC, xmmH        ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+    psrldq      xmmB, 2           ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+    movdqa      xmmF, xmmE
+    punpcklwd   xmmE, xmmB        ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+    punpckhwd   xmmF, xmmB        ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+    pshufd      xmmH, xmmA, 0x4E  ; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+    movdqa      xmmB, xmmE
+    punpckldq   xmmA, xmmD        ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+    punpckldq   xmmE, xmmH        ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+    punpckhdq   xmmD, xmmB        ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+    pshufd      xmmH, xmmG, 0x4E  ; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+    movdqa      xmmB, xmmF
+    punpckldq   xmmG, xmmC        ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+    punpckldq   xmmF, xmmH        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+    punpckhdq   xmmC, xmmB        ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+    punpcklqdq  xmmA, xmmE        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+    punpcklqdq  xmmD, xmmG        ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+    punpcklqdq  xmmF, xmmC        ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    lea         rcx, [rcx+rcx*2]            ; imul ecx, RGB_PIXELSIZE
+    cmp         rcx, byte 2*SIZEOF_XMMWORD
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmF
+    sub         rcx, byte 2*SIZEOF_XMMWORD
+    jmp         short .column_st15
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD
+.column_st15:
+    ; Store the lower 8 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_MMWORD
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_MMWORD
+    sub         rcx, byte SIZEOF_MMWORD
+    psrldq      xmmA, SIZEOF_MMWORD
+.column_st7:
+    ; Store the lower 4 bytes of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_DWORD
+    jb          short .column_st3
+    movd        XMM_DWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_DWORD
+    sub         rcx, byte SIZEOF_DWORD
+    psrldq      xmmA, SIZEOF_DWORD
+.column_st3:
+    ; Store the lower 2 bytes of rax to the output when it has enough
+    ; space.
+    movd        eax, xmmA
+    cmp         rcx, byte SIZEOF_WORD
+    jb          short .column_st1
+    mov         word [rdi], ax
+    add         rdi, byte SIZEOF_WORD
+    sub         rcx, byte SIZEOF_WORD
+    shr         rax, 16
+.column_st1:
+    ; Store the lower 1 byte of rax to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    mov         byte [rdi], al
+
+%else  ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+    pcmpeqb     xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pcmpeqb     xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%else
+    pxor        xmm6, xmm6              ; xmm6=XE=X(02468ACE********)
+    pxor        xmm7, xmm7              ; xmm7=XO=X(13579BDF********)
+%endif
+    ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+    ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+    ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+    ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+    punpcklbw   xmmA, xmmC  ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+    punpcklbw   xmmE, xmmG  ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+    punpcklbw   xmmB, xmmD  ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+    punpcklbw   xmmF, xmmH  ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+    movdqa      xmmC, xmmA
+    punpcklwd   xmmA, xmmE  ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+    punpckhwd   xmmC, xmmE  ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+    movdqa      xmmG, xmmB
+    punpcklwd   xmmB, xmmF  ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+    punpckhwd   xmmG, xmmF  ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+    movdqa      xmmD, xmmA
+    punpckldq   xmmA, xmmB  ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+    punpckhdq   xmmD, xmmB  ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+    movdqa      xmmH, xmmC
+    punpckldq   xmmC, xmmG  ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+    punpckhdq   xmmH, xmmG  ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+    cmp         rcx, byte SIZEOF_XMMWORD
+    jb          short .column_st32
+
+    test        rdi, SIZEOF_XMMWORD-1
+    jnz         short .out1
+    ; --(aligned)-------------------
+    movntdq     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movntdq     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movntdq     XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movntdq     XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+    jmp         short .out0
+.out1:  ; --(unaligned)-----------------
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    movdqu      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+    movdqu      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
+.out0:
+    add         rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+    sub         rcx, byte SIZEOF_XMMWORD
+    jz          near .endcolumn
+
+    add         rsi, byte SIZEOF_XMMWORD  ; inptr0
+    dec         al                        ; Yctr
+    jnz         near .Yloop_2nd
+
+    add         rbx, byte SIZEOF_XMMWORD  ; inptr1
+    add         rdx, byte SIZEOF_XMMWORD  ; inptr2
+    jmp         near .columnloop
+
+.column_st32:
+    cmp         rcx, byte SIZEOF_XMMWORD/2
+    jb          short .column_st16
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    movdqu      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    movdqa      xmmA, xmmC
+    movdqa      xmmD, xmmH
+    sub         rcx, byte SIZEOF_XMMWORD/2
+.column_st16:
+    cmp         rcx, byte SIZEOF_XMMWORD/4
+    jb          short .column_st15
+    movdqu      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+    add         rdi, byte SIZEOF_XMMWORD    ; outptr
+    movdqa      xmmA, xmmD
+    sub         rcx, byte SIZEOF_XMMWORD/4
+.column_st15:
+    ; Store two pixels (8 bytes) of xmmA to the output when it has enough
+    ; space.
+    cmp         rcx, byte SIZEOF_XMMWORD/8
+    jb          short .column_st7
+    movq        XMM_MMWORD [rdi], xmmA
+    add         rdi, byte SIZEOF_XMMWORD/8*4
+    sub         rcx, byte SIZEOF_XMMWORD/8
+    psrldq      xmmA, SIZEOF_XMMWORD/8*4
+.column_st7:
+    ; Store one pixel (4 bytes) of xmmA to the output when it has enough
+    ; space.
+    test        rcx, rcx
+    jz          short .endcolumn
+    movd        XMM_DWORD [rdi], xmmA
+
+%endif  ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+    sfence                              ; flush the write buffer
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2(JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+; r10d = JDIMENSION output_width
+; r11 = JSAMPIMAGE input_buf
+; r12d = JDIMENSION in_row_group_ctr
+; r13 = JSAMPARRAY output_buf
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         eax, r10d
+
+    mov         rdi, r11
+    mov         ecx, r12d
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rdi, r13
+    lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
+
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+
+    add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
+    add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
+
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; intpr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; intpr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; intpr2
+    mov         rbx, rsp
+
+    push        rdi
+    push        rcx
+    push        rax
+
+    %ifdef WIN64
+    mov         r8, rcx
+    mov         r9, rdi
+    mov         rcx, rax
+    mov         rdx, rbx
+    %else
+    mov         rdx, rcx
+    mov         rcx, rdi
+    mov         rdi, rax
+    mov         rsi, rbx
+    %endif
+
+    call        EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+    pop         rax
+    pop         rcx
+    pop         rdi
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdsample-avx2.asm b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
new file mode 100644
index 0000000000..1e4979f933
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-avx2.asm
@@ -0,0 +1,696 @@
+;
+; jdsample.asm - upsampling (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_avx2)
+
+EXTN(jconst_fancy_upsample_avx2):
+
+PW_ONE   times 16 dw 1
+PW_TWO   times 16 dw 2
+PW_THREE times 16 dw 3
+PW_SEVEN times 16 dw 7
+PW_EIGHT times 16 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v1_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    push_xmm    3
+    collect_args 4
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+
+    vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-1)  ; (ff -- -- -- ... -- --) LSB is ff
+
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-1)
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ff) MSB is ff
+
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    test        rax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    vpand       ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    vpand       ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+    jmp         short .upsample
+
+.columnloop:
+    vmovdqu     ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD]
+    vperm2i128  ymm6, ymm0, ymm6, 0x20
+    vpslldq     ymm6, ymm6, 15
+
+.upsample:
+    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm1=( 0  1  2 ... 29 30 31)
+
+    vperm2i128  ymm2, ymm0, ymm1, 0x20
+    vpalignr    ymm2, ymm1, ymm2, 15            ; ymm2=(--  0  1 ... 28 29 30)
+    vperm2i128  ymm4, ymm0, ymm1, 0x03
+    vpalignr    ymm3, ymm4, ymm1, 1             ; ymm3=( 1  2  3 ... 30 31 --)
+
+    vpor        ymm2, ymm2, ymm7                ; ymm2=(-1  0  1 ... 28 29 30)
+    vpor        ymm3, ymm3, ymm6                ; ymm3=( 1  2  3 ... 30 31 32)
+
+    vpsrldq     ymm7, ymm4, (SIZEOF_XMMWORD-1)  ; ymm7=(31 -- -- ... -- -- --)
+
+    vpunpckhbw  ymm4, ymm1, ymm0                ; ymm4=( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm1, ymm0                ; ymm5=( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm5, ymm4, 0x20          ; ymm1=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31          ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm2, ymm0                ; ymm5=( 7  8  9 10 11 12 13 14 23 24 25 26 27 28 29 30)
+    vpunpcklbw  ymm6, ymm2, ymm0                ; ymm6=(-1  0  1  2  3  4  5  6 15 16 17 18 19 20 21 22)
+    vperm2i128  ymm2, ymm6, ymm5, 0x20          ; ymm2=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31          ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpunpckhbw  ymm6, ymm3, ymm0                ; ymm6=( 1  2  3  4  5  6  7  8 17 18 19 20 21 22 23 24)
+    vpunpcklbw  ymm8, ymm3, ymm0                ; ymm8=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
+    vperm2i128  ymm3, ymm8, ymm6, 0x20          ; ymm3=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vperm2i128  ymm6, ymm8, ymm6, 0x31          ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vpmullw     ymm1, ymm1, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+    vpaddw      ymm2, ymm2, [rel PW_ONE]
+    vpaddw      ymm5, ymm5, [rel PW_ONE]
+    vpaddw      ymm3, ymm3, [rel PW_TWO]
+    vpaddw      ymm6, ymm6, [rel PW_TWO]
+
+    vpaddw      ymm2, ymm2, ymm1
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm2, ymm2, 2                   ; ymm2=OutLE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 2                   ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm3, ymm3, ymm1
+    vpaddw      ymm6, ymm6, ymm4
+    vpsrlw      ymm3, ymm3, 2                   ; ymm3=OutLO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm6, ymm6, 2                   ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm3, ymm3, BYTE_BIT
+    vpsllw      ymm6, ymm6, BYTE_BIT
+    vpor        ymm2, ymm2, ymm3                ; ymm2=OutL=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm6                ; ymm5=OutH=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
+
+    sub         rax, byte SIZEOF_YMMWORD
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 4
+    pop_xmm     3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
+
+EXTN(jsimd_h2v2_fancy_upsample_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    push_xmm    3
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
+    vpcmpeqb    xmm9, xmm9, xmm9
+    vpsrldq     xmm10, xmm9, (SIZEOF_XMMWORD-2)  ; (ffff ---- ---- ... ---- ----) LSB is ffff
+    vpslldq     xmm9, xmm9, (SIZEOF_XMMWORD-2)
+    vperm2i128  ymm9, ymm9, ymm9, 1              ; (---- ---- ... ---- ---- ffff) MSB is ffff
+
+    test        rax, SIZEOF_YMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD]  ; ymm0=row[ 0][0]
+    vmovdqu     ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD]  ; ymm1=row[-1][0]
+    vmovdqu     ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD]  ; ymm2=row[+1][0]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm3, ymm2, ymm8        ; ymm3=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm3, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm3, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
+
+    vpand       ymm1, ymm1, ymm10       ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vpand       ymm2, ymm2, ymm10       ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+
+    vmovdqa     YMMWORD [wk(0)], ymm1
+    vmovdqa     YMMWORD [wk(1)], ymm2
+
+    add         rax, byte SIZEOF_YMMWORD-1
+    and         rax, byte -SIZEOF_YMMWORD
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    vpand       ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
+    vpand       ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
+
+    vmovdqa     YMMWORD [wk(2)], ymm1   ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+    vmovdqa     YMMWORD [wk(3)], ymm2   ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    vmovdqu     ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD]  ; ymm0=row[ 0][1]
+    vmovdqu     ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD]  ; ymm1=row[-1][1]
+    vmovdqu     ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD]  ; ymm2=row[+1][1]
+
+    vpunpckhbw  ymm4, ymm0, ymm8        ; ymm4=row[ 0]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm5, ymm0, ymm8        ; ymm5=row[ 0]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm0, ymm5, ymm4, 0x20  ; ymm0=row[ 0]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm4, ymm5, ymm4, 0x31  ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm5, ymm1, ymm8        ; ymm5=row[-1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm6, ymm1, ymm8        ; ymm6=row[-1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm1, ymm6, ymm5, 0x20  ; ymm1=row[-1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm5, ymm6, ymm5, 0x31  ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpunpckhbw  ymm6, ymm2, ymm8        ; ymm6=row[+1]( 8  9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
+    vpunpcklbw  ymm7, ymm2, ymm8        ; ymm7=row[+1]( 0  1  2  3  4  5  6  7 16 17 18 19 20 21 22 23)
+    vperm2i128  ymm2, ymm7, ymm6, 0x20  ; ymm2=row[+1]( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vperm2i128  ymm6, ymm7, ymm6, 0x31  ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vpmullw     ymm0, ymm0, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+
+    vpaddw      ymm1, ymm1, ymm0        ; ymm1=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm5, ymm5, ymm4        ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+    vpaddw      ymm2, ymm2, ymm0        ; ymm2=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vpaddw      ymm6, ymm6, ymm4        ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vmovdqu     YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1  ; temporarily save
+    vmovdqu     YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5  ; the intermediate data
+    vmovdqu     YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
+    vmovdqu     YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
+
+    vperm2i128  ymm1, ymm8, ymm1, 0x20
+    vpslldq     ymm1, ymm1, 14          ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+    vperm2i128  ymm2, ymm8, ymm2, 0x20
+    vpslldq     ymm2, ymm2, 14          ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- --  0)
+
+    vmovdqa     YMMWORD [wk(2)], ymm1
+    vmovdqa     YMMWORD [wk(3)], ymm2
+
+.upsample:
+    ; -- process the upper row
+
+    vmovdqu     ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD]  ; ymm7=Int0L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD]  ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm0, ymm8, ymm7, 0x03
+    vpalignr    ymm0, ymm0, ymm7, 2     ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x20
+    vpslldq     ymm4, ymm4, 14          ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm5, ymm8, ymm7, 0x03
+    vpsrldq     ymm5, ymm5, 14          ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm6, ymm8, ymm3, 0x20
+    vpalignr    ymm6, ymm3, ymm6, 14    ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm0, ymm0, ymm4        ; ymm0=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm5, ymm5, ymm6        ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm2, ymm8, ymm3, 0x03
+    vpalignr    ymm2, ymm2, ymm3, 2     ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm4, ymm8, ymm3, 0x03
+    vpsrldq     ymm4, ymm4, 14          ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm7, 0x20
+    vpalignr    ymm1, ymm7, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(0)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm2, ymm2, YMMWORD [wk(2)]  ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(0)], ymm4
+
+    vpmullw     ymm7, ymm7, [rel PW_THREE]
+    vpmullw     ymm3, ymm3, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm5, ymm5, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_SEVEN]
+    vpaddw      ymm2, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm5, ymm5, ymm3
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out0LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm2, ymm2, ymm3
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out0LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm2, ymm2, 4           ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm0, ymm0, BYTE_BIT
+    vpsllw      ymm2, ymm2, BYTE_BIT
+    vpor        ymm1, ymm1, ymm0        ; ymm1=Out0L=( 0  1  2 ... 29 30 31)
+    vpor        ymm5, ymm5, ymm2        ; ymm5=Out0H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
+
+    ; -- process the lower row
+
+    vmovdqu     ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD]  ; ymm6=Int1L=( 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15)
+    vmovdqu     ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD]  ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
+
+    vperm2i128  ymm7, ymm8, ymm6, 0x03
+    vpalignr    ymm7, ymm7, ymm6, 2     ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x20
+    vpslldq     ymm3, ymm3, 14          ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
+
+    vperm2i128  ymm0, ymm8, ymm6, 0x03
+    vpsrldq     ymm0, ymm0, 14          ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm2, ymm8, ymm4, 0x20
+    vpalignr    ymm2, ymm4, ymm2, 14    ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vpor        ymm7, ymm7, ymm3        ; ymm7=( 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16)
+    vpor        ymm0, ymm0, ymm2        ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
+
+    vperm2i128  ymm5, ymm8, ymm4, 0x03
+    vpalignr    ymm5, ymm5, ymm4, 2     ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
+    vperm2i128  ymm3, ymm8, ymm4, 0x03
+    vpsrldq     ymm3, ymm3, 14          ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
+    vperm2i128  ymm1, ymm8, ymm6, 0x20
+    vpalignr    ymm1, ymm6, ymm1, 14    ; ymm1=(--  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+
+    vpor        ymm1, ymm1, YMMWORD [wk(1)]  ; ymm1=(-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14)
+    vpor        ymm5, ymm5, YMMWORD [wk(3)]  ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
+
+    vmovdqa     YMMWORD [wk(1)], ymm3
+
+    vpmullw     ymm6, ymm6, [rel PW_THREE]
+    vpmullw     ymm4, ymm4, [rel PW_THREE]
+    vpaddw      ymm1, ymm1, [rel PW_EIGHT]
+    vpaddw      ymm0, ymm0, [rel PW_EIGHT]
+    vpaddw      ymm7, ymm7, [rel PW_SEVEN]
+    vpaddw      ymm5, ymm5, [rel PW_SEVEN]
+
+    vpaddw      ymm1, ymm1, ymm6
+    vpaddw      ymm0, ymm0, ymm4
+    vpsrlw      ymm1, ymm1, 4           ; ymm1=Out1LE=( 0  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30)
+    vpsrlw      ymm0, ymm0, 4           ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
+    vpaddw      ymm7, ymm7, ymm6
+    vpaddw      ymm5, ymm5, ymm4
+    vpsrlw      ymm7, ymm7, 4           ; ymm7=Out1LO=( 1  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31)
+    vpsrlw      ymm5, ymm5, 4           ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
+
+    vpsllw      ymm7, ymm7, BYTE_BIT
+    vpsllw      ymm5, ymm5, BYTE_BIT
+    vpor        ymm1, ymm1, ymm7        ; ymm1=Out1L=( 0  1  2 ... 29 30 31)
+    vpor        ymm0, ymm0, ymm5        ; ymm0=Out1H=(32 33 34 ... 61 62 63)
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
+
+    sub         rax, byte SIZEOF_YMMWORD
+    add         rcx, byte 1*SIZEOF_YMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_YMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_YMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_YMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    pop_xmm     3
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
+
+EXTN(jsimd_h2v1_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         edx, r11d
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          short .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+    mov         rax, rdx                ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          near .above_16
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         short .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD    ; inptr
+    add         rdi, byte 2*SIZEOF_YMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    vzeroupper
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
+
+EXTN(jsimd_h2v2_upsample_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         edx, r11d
+    add         rdx, byte (SIZEOF_YMMWORD-1)
+    and         rdx, -SIZEOF_YMMWORD
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    cmp         rax, byte SIZEOF_YMMWORD
+    ja          short .above_16
+
+    vmovdqu     xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    vpunpckhbw  xmm1, xmm0, xmm0
+    vpunpcklbw  xmm0, xmm0, xmm0
+
+    vmovdqu     XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    vmovdqu     XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    vmovdqu     XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    jmp         near .nextrow
+
+.above_16:
+    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
+
+    vpermq      ymm0, ymm0, 0xd8
+    vpunpckhbw  ymm1, ymm0, ymm0
+    vpunpcklbw  ymm0, ymm0, ymm0
+
+    vmovdqu     YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
+    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
+    vmovdqu     YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
+
+    sub         rax, byte 2*SIZEOF_YMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte SIZEOF_YMMWORD  ; inptr
+    add         rbx, 2*SIZEOF_YMMWORD     ; outptr0
+    add         rdi, 2*SIZEOF_YMMWORD     ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    vzeroupper
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jdsample-sse2.asm b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
new file mode 100644
index 0000000000..38dbceec26
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jdsample-sse2.asm
@@ -0,0 +1,665 @@
+;
+; jdsample.asm - upsampling (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fancy_upsample_sse2)
+
+EXTN(jconst_fancy_upsample_sse2):
+
+PW_ONE   times 8 dw 1
+PW_TWO   times 8 dw 2
+PW_THREE times 8 dw 3
+PW_SEVEN times 8 dw 7
+PW_EIGHT times 8 dw 8
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
+;
+; The upsampling algorithm is linear interpolation between pixel centers,
+; also known as a "triangle filter".  This is a good compromise between
+; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
+; of the way between input pixel centers.
+;
+; GLOBAL(void)
+; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v1_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+.skip:
+    pxor        xmm0, xmm0              ; xmm0=(all 0's)
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)
+    pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    pcmpeqb     xmm6, xmm6
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+    pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    jmp         short .upsample
+
+.columnloop:
+    movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+    pslldq      xmm6, (SIZEOF_XMMWORD-1)
+
+.upsample:
+    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+    movdqa      xmm2, xmm1
+    movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
+    pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
+    psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
+
+    por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
+    por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
+
+    movdqa      xmm7, xmm1
+    psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
+
+    movdqa      xmm4, xmm1
+    punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm2
+    punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
+    punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
+    movdqa      xmm6, xmm3
+    punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
+    punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
+
+    pmullw      xmm1, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+    paddw       xmm2, [rel PW_ONE]
+    paddw       xmm5, [rel PW_ONE]
+    paddw       xmm3, [rel PW_TWO]
+    paddw       xmm6, [rel PW_TWO]
+
+    paddw       xmm2, xmm1
+    paddw       xmm5, xmm4
+    psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
+    paddw       xmm3, xmm1
+    paddw       xmm6, xmm4
+    psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm3, BYTE_BIT
+    psllw       xmm6, BYTE_BIT
+    por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        eax, eax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rax
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          near .rowloop
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
+; Again a triangle filter; see comments for h2v1 case, above.
+;
+; GLOBAL(void)
+; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
+;                                JDIMENSION downsampled_width,
+;                                JSAMPARRAY input_data,
+;                                JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION downsampled_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  4
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
+
+EXTN(jsimd_h2v2_fancy_upsample_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+    push        rbx
+
+    mov         eax, r11d               ; colctr
+    test        rax, rax
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rax                     ; colctr
+    push        rcx
+    push        rdi
+    push        rsi
+
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+
+    test        rax, SIZEOF_XMMWORD-1
+    jz          short .skip
+    push        rdx
+    mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
+    mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
+    mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
+    pop         rdx
+.skip:
+    ; -- process the first column block
+
+    movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
+    movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
+    movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    pcmpeqb     xmm7, xmm7
+    psrldq      xmm7, (SIZEOF_XMMWORD-2)
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
+
+    pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
+    pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
+
+    movdqa      XMMWORD [wk(0)], xmm1
+    movdqa      XMMWORD [wk(1)], xmm2
+
+    add         rax, byte SIZEOF_XMMWORD-1
+    and         rax, byte -SIZEOF_XMMWORD
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          short .columnloop
+
+.columnloop_last:
+    ; -- process the last column block
+
+    pcmpeqb     xmm1, xmm1
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)
+    movdqa      xmm2, xmm1
+
+    pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+    pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
+    movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
+
+    jmp         near .upsample
+
+.columnloop:
+    ; -- process the next column block
+
+    movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
+    movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
+
+    pxor        xmm3, xmm3              ; xmm3=(all 0's)
+    movdqa      xmm4, xmm0
+    punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm5, xmm1
+    punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
+    movdqa      xmm6, xmm2
+    punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
+    punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
+
+    pmullw      xmm0, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+
+    paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
+    paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
+    paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
+
+    movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
+    movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
+
+    pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
+    pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
+
+    movdqa      XMMWORD [wk(2)], xmm1
+    movdqa      XMMWORD [wk(3)], xmm2
+
+.upsample:
+    ; -- process the upper row
+
+    movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
+    movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
+    movdqa      xmm5, xmm7
+    movdqa      xmm6, xmm3
+    psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
+
+    por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
+    por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm2, xmm3
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm4, xmm3
+    psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(0)], xmm4
+
+    pmullw      xmm7, [rel PW_THREE]
+    pmullw      xmm3, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm5, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_SEVEN]
+    paddw       xmm2, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm7
+    paddw       xmm5, xmm3
+    psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm0, xmm7
+    paddw       xmm2, xmm3
+    psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm0, BYTE_BIT
+    psllw       xmm2, BYTE_BIT
+    por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
+    por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
+
+    ; -- process the lower row
+
+    movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
+    movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
+    movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
+    psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
+    pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
+    movdqa      xmm0, xmm6
+    movdqa      xmm2, xmm4
+    psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
+    pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
+
+    por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
+    por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
+
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm4
+    pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
+    psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
+    movdqa      xmm3, xmm4
+    psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
+
+    por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
+    por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
+
+    movdqa      XMMWORD [wk(1)], xmm3
+
+    pmullw      xmm6, [rel PW_THREE]
+    pmullw      xmm4, [rel PW_THREE]
+    paddw       xmm1, [rel PW_EIGHT]
+    paddw       xmm0, [rel PW_EIGHT]
+    paddw       xmm7, [rel PW_SEVEN]
+    paddw       xmm5, [rel PW_SEVEN]
+
+    paddw       xmm1, xmm6
+    paddw       xmm0, xmm4
+    psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
+    psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
+    paddw       xmm7, xmm6
+    paddw       xmm5, xmm4
+    psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
+    psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
+
+    psllw       xmm7, BYTE_BIT
+    psllw       xmm5, BYTE_BIT
+    por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
+    por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
+
+    sub         rax, byte SIZEOF_XMMWORD
+    add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
+    add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
+    add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
+    add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
+    cmp         rax, byte SIZEOF_XMMWORD
+    ja          near .columnloop
+    test        rax, rax
+    jnz         near .columnloop_last
+
+    pop         rsi
+    pop         rdi
+    pop         rcx
+    pop         rax
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
+
+EXTN(jsimd_h2v1_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+
+    mov         edx, r11d
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1
+    and         rdx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          short .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
+    mov         rax, rdx                ; colctr
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
+    dec         rcx                        ; rowctr
+    jg          short .rowloop
+
+.return:
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
+; It's still a box filter.
+;
+; GLOBAL(void)
+; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
+;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
+;
+
+; r10 = int max_v_samp_factor
+; r11d = JDIMENSION output_width
+; r12 = JSAMPARRAY input_data
+; r13 = JSAMPARRAY *output_data_ptr
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
+
+EXTN(jsimd_h2v2_upsample_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    mov         edx, r11d
+    add         rdx, byte (2*SIZEOF_XMMWORD)-1
+    and         rdx, byte -(2*SIZEOF_XMMWORD)
+    jz          near .return
+
+    mov         rcx, r10                ; rowctr
+    test        rcx, rcx
+    jz          near .return
+
+    mov         rsi, r12                ; input_data
+    mov         rdi, r13
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
+.rowloop:
+    push        rdi
+    push        rsi
+
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
+    mov         rax, rdx                               ; colctr
+.columnloop:
+
+    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
+
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, xmm0
+    punpckhbw   xmm1, xmm1
+
+    movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
+    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
+    movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
+
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm2
+    punpckhbw   xmm3, xmm3
+
+    movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
+    movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
+    movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
+
+    sub         rax, byte 2*SIZEOF_XMMWORD
+    jz          short .nextrow
+
+    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
+    add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
+    add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
+    jmp         short .columnloop
+
+.nextrow:
+    pop         rsi
+    pop         rdi
+
+    add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
+    add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
+    sub         rcx, byte 2                  ; rowctr
+    jg          near .rowloop
+
+.return:
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctflt-sse.asm b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
new file mode 100644
index 0000000000..ef2796649b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctflt-sse.asm
@@ -0,0 +1,355 @@
+;
+; jfdctflt.asm - floating-point FDCT (64-bit SSE)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the forward DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro  unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro  unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_float_sse)
+
+EXTN(jconst_fdct_float_sse):
+
+PD_0_382 times 4 dd 0.382683432365089771728460
+PD_0_707 times 4 dd 0.707106781186547524400844
+PD_0_541 times 4 dd 0.541196100146196984399723
+PD_1_306 times 4 dd 1.306562964876376527856643
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_float_sse(FAST_FLOAT *data)
+;
+
+; r10 = FAST_FLOAT *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_float_sse)
+
+EXTN(jsimd_fdct_float_sse):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4
+.rowloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+    ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
+    unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+    ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
+    unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         near .rowloop
+
+    ; ---- Pass 2: process columns.
+
+    mov         rdx, r10                ; (FAST_FLOAT *)
+    mov         rcx, DCTSIZE/4
+.columnloop:
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+    ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+
+    movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
+    unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
+    movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
+    unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
+    unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
+
+    movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
+
+    ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+    ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+
+    movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
+    movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
+
+    movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
+    unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
+    movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
+    unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
+    unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
+
+    movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
+    unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
+    movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
+    unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
+    unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
+
+    movaps      xmm0, xmm7
+    movaps      xmm5, xmm6
+    subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
+    subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
+    addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
+
+    movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
+    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
+    movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
+    movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
+    unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
+    movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
+    unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
+
+    movaps      xmm2, xmm7
+    movaps      xmm3, xmm4
+    addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
+    addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
+    subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movaps      xmm1, xmm5
+    movaps      xmm6, xmm0
+    subps       xmm5, xmm7              ; xmm5=tmp13
+    subps       xmm0, xmm4              ; xmm0=tmp12
+    addps       xmm1, xmm7              ; xmm1=tmp10
+    addps       xmm6, xmm4              ; xmm6=tmp11
+
+    addps       xmm0, xmm5
+    mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
+
+    movaps      xmm7, xmm1
+    movaps      xmm4, xmm5
+    subps       xmm1, xmm6              ; xmm1=data4
+    subps       xmm5, xmm0              ; xmm5=data6
+    addps       xmm7, xmm6              ; xmm7=data0
+    addps       xmm4, xmm0              ; xmm4=data2
+
+    movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    ; -- Odd part
+
+    movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
+    movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
+
+    addps       xmm2, xmm3              ; xmm2=tmp10
+    addps       xmm3, xmm6              ; xmm3=tmp11
+    addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
+
+    mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
+
+    movaps      xmm1, xmm2              ; xmm1=tmp10
+    subps       xmm2, xmm6
+    mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
+    mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+    mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+    addps       xmm1, xmm2              ; xmm1=z2
+    addps       xmm6, xmm2              ; xmm6=z4
+
+    movaps      xmm5, xmm0
+    subps       xmm0, xmm3              ; xmm0=z13
+    addps       xmm5, xmm3              ; xmm5=z11
+
+    movaps      xmm7, xmm0
+    movaps      xmm4, xmm5
+    subps       xmm0, xmm1              ; xmm0=data3
+    subps       xmm5, xmm6              ; xmm5=data7
+    addps       xmm7, xmm1              ; xmm7=data5
+    addps       xmm4, xmm6              ; xmm4=data1
+
+    movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
+    movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
+
+    add         rdx, byte 4*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         near .columnloop
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
new file mode 100644
index 0000000000..2e1bfe6e8c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctfst-sse2.asm
@@ -0,0 +1,389 @@
+;
+; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the forward DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+
+%if CONST_BITS == 8
+F_0_382 equ  98  ; FIX(0.382683433)
+F_0_541 equ 139  ; FIX(0.541196100)
+F_0_707 equ 181  ; FIX(0.707106781)
+F_1_306 equ 334  ; FIX(1.306562965)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
+F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_ifast_sse2)
+
+EXTN(jconst_fdct_ifast_sse2):
+
+PW_F0707 times 8 dw F_0_707 << CONST_SHIFT
+PW_F0382 times 8 dw F_0_382 << CONST_SHIFT
+PW_F0541 times 8 dw F_0_541 << CONST_SHIFT
+PW_F1306 times 8 dw F_1_306 << CONST_SHIFT
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_ifast_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_ifast_sse2)
+
+EXTN(jsimd_fdct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    psubw       xmm3, xmm1              ; xmm3=tmp13
+    psubw       xmm6, xmm7              ; xmm6=tmp12
+    paddw       xmm4, xmm1              ; xmm4=tmp10
+    paddw       xmm0, xmm7              ; xmm0=tmp11
+
+    paddw       xmm6, xmm3
+    psllw       xmm6, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm6, [rel PW_F0707]    ; xmm6=z1
+
+    movdqa      xmm1, xmm4
+    movdqa      xmm7, xmm3
+    psubw       xmm4, xmm0              ; xmm4=data4
+    psubw       xmm3, xmm6              ; xmm3=data6
+    paddw       xmm1, xmm0              ; xmm1=data0
+    paddw       xmm7, xmm6              ; xmm7=data2
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=data4
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=data6
+
+    ; -- Odd part
+
+    paddw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm5, xmm0              ; xmm5=tmp11
+    paddw       xmm0, xmm6              ; xmm0=tmp12, xmm6=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z3
+
+    movdqa      xmm4, xmm2              ; xmm4=tmp10
+    psubw       xmm2, xmm0
+    pmulhw      xmm2, [rel PW_F0382]    ; xmm2=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm0, [rel PW_F1306]    ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm2              ; xmm4=z2
+    paddw       xmm0, xmm2              ; xmm0=z4
+
+    movdqa      xmm3, xmm6
+    psubw       xmm6, xmm5              ; xmm6=z13
+    paddw       xmm3, xmm5              ; xmm3=z11
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm5, xmm3
+    psubw       xmm6, xmm4              ; xmm6=data3
+    psubw       xmm3, xmm0              ; xmm3=data7
+    paddw       xmm2, xmm4              ; xmm2=data5
+    paddw       xmm5, xmm0              ; xmm5=data1
+
+    ; ---- Pass 2: process columns.
+
+    ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
+    ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 50 51 60 61 70 71)
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm6              ; xmm7=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm0, xmm6              ; xmm0=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=col4
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=col6
+
+    ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
+    ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm2              ; xmm5=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm7, xmm2              ; xmm7=(44 45 54 55 64 65 74 75)
+    movdqa      xmm0, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm3              ; xmm6=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm0, xmm3              ; xmm0=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm2, xmm5              ; transpose coefficients(phase 2)
+    punpckldq   xmm5, xmm6              ; xmm5=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm2, xmm6              ; xmm2=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm0              ; xmm7=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm3, xmm0              ; xmm3=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm2, xmm6              ; xmm2=(20 21 22 23 30 31 32 33)
+    movdqa      xmm7, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm7, xmm0              ; xmm7=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm5              ; xmm1=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm6, xmm5              ; xmm6=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm0, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm3              ; xmm7=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm0, xmm3              ; xmm0=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm3, xmm1
+    psubw       xmm6, xmm7              ; xmm6=data1-data6=tmp6
+    psubw       xmm1, xmm0              ; xmm1=data0-data7=tmp7
+    paddw       xmm5, xmm7              ; xmm5=data1+data6=tmp1
+    paddw       xmm3, xmm0              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
+    movdqa      xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
+
+    movdqa      xmm6, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm7              ; xmm2=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm6, xmm7              ; xmm6=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm1, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm1, xmm0              ; xmm1=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm0, xmm2
+    paddw       xmm6, xmm4              ; xmm6=data3+data4=tmp3
+    paddw       xmm2, xmm1              ; xmm2=data2+data5=tmp2
+    psubw       xmm7, xmm4              ; xmm7=data3-data4=tmp4
+    psubw       xmm0, xmm1              ; xmm0=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm1, xmm5
+    psubw       xmm3, xmm6              ; xmm3=tmp13
+    psubw       xmm5, xmm2              ; xmm5=tmp12
+    paddw       xmm4, xmm6              ; xmm4=tmp10
+    paddw       xmm1, xmm2              ; xmm1=tmp11
+
+    paddw       xmm5, xmm3
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F0707]    ; xmm5=z1
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm2, xmm3
+    psubw       xmm4, xmm1              ; xmm4=data4
+    psubw       xmm3, xmm5              ; xmm3=data6
+    paddw       xmm6, xmm1              ; xmm6=data0
+    paddw       xmm2, xmm5              ; xmm2=data2
+
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    ; -- Odd part
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    paddw       xmm7, xmm0              ; xmm7=tmp10
+    paddw       xmm0, xmm1              ; xmm0=tmp11
+    paddw       xmm1, xmm5              ; xmm1=tmp12, xmm5=tmp7
+
+    psllw       xmm7, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm0, [rel PW_F0707]    ; xmm0=z3
+
+    movdqa      xmm4, xmm7              ; xmm4=tmp10
+    psubw       xmm7, xmm1
+    pmulhw      xmm7, [rel PW_F0382]    ; xmm7=z5
+    pmulhw      xmm4, [rel PW_F0541]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
+    pmulhw      xmm1, [rel PW_F1306]    ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
+    paddw       xmm4, xmm7              ; xmm4=z2
+    paddw       xmm1, xmm7              ; xmm1=z4
+
+    movdqa      xmm3, xmm5
+    psubw       xmm5, xmm0              ; xmm5=z13
+    paddw       xmm3, xmm0              ; xmm3=z11
+
+    movdqa      xmm6, xmm5
+    movdqa      xmm2, xmm3
+    psubw       xmm5, xmm4              ; xmm5=data3
+    psubw       xmm3, xmm1              ; xmm3=data7
+    paddw       xmm6, xmm4              ; xmm6=data5
+    paddw       xmm2, xmm1              ; xmm2=data1
+
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-avx2.asm b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
new file mode 100644
index 0000000000..e56258b48a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-avx2.asm
@@ -0,0 +1,320 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpckhwd  %6, %1, %2
+    vpunpcklwd  %7, %3, %4
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 01 11 02 12 03 13  40 50 41 51 42 52 43 53)
+    ; %6=(04 14 05 15 06 16 07 17  44 54 45 55 46 56 47 57)
+    ; %7=(20 30 21 31 22 32 23 33  60 70 61 71 62 72 63 73)
+    ; %8=(24 34 25 35 26 36 27 37  64 74 65 75 66 76 67 77)
+
+    vpunpckldq  %1, %5, %7
+    vpunpckhdq  %2, %5, %7
+    vpunpckldq  %3, %6, %8
+    vpunpckhdq  %4, %6, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %2=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %3=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %4=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpermq      %1, %1, 0x8D
+    vpermq      %2, %2, 0x8D
+    vpermq      %3, %3, 0xD8
+    vpermq      %4, %4, 0xD8
+    ; transpose coefficients(phase 3)
+    ; %1=(01 11 21 31 41 51 61 71  00 10 20 30 40 50 60 70)
+    ; %2=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %3=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %4=(06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+; %9:    Pass (1 or 2)
+
+%macro dodct 9
+    vpsubw      %5, %1, %4              ; %5=data1_0-data6_7=tmp6_7
+    vpaddw      %6, %1, %4              ; %6=data1_0+data6_7=tmp1_0
+    vpaddw      %7, %2, %3              ; %7=data3_2+data4_5=tmp3_2
+    vpsubw      %8, %2, %3              ; %8=data3_2-data4_5=tmp4_5
+
+    ; -- Even part
+
+    vperm2i128  %6, %6, %6, 0x01        ; %6=tmp0_1
+    vpaddw      %1, %6, %7              ; %1=tmp0_1+tmp3_2=tmp10_11
+    vpsubw      %6, %6, %7              ; %6=tmp0_1-tmp3_2=tmp13_12
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=tmp11_10
+    vpsignw     %1, %1, [rel PW_1_NEG1]  ; %1=tmp10_neg11
+    vpaddw      %7, %7, %1              ; %7=(tmp10+tmp11)_(tmp10-tmp11)
+%if %9 == 1
+    vpsllw      %1, %7, PASS1_BITS      ; %1=data0_4
+%else
+    vpaddw      %7, %7, [rel PW_DESCALE_P2X]
+    vpsraw      %1, %7, PASS1_BITS      ; %1=data0_4
+%endif
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    vperm2i128  %7, %6, %6, 0x01        ; %7=tmp12_13
+    vpunpcklwd  %2, %6, %7
+    vpunpckhwd  %6, %6, %7
+    vpmaddwd    %2, %2, [rel PW_F130_F054_MF130_F054]  ; %2=data2_6L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=data2_6H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %6, %6, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %6, %6, DESCALE_P %+ %9
+
+    vpackssdw   %3, %2, %6              ; %6=data2_6
+
+    ; -- Odd part
+
+    vpaddw      %7, %8, %5              ; %7=tmp4_5+tmp6_7=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %2, %7, %7, 0x01        ; %2=z4_3
+    vpunpcklwd  %6, %7, %2
+    vpunpckhwd  %7, %7, %2
+    vpmaddwd    %6, %6, [rel PW_MF078_F117_F078_F117]  ; %6=z3_4L
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    vperm2i128  %4, %5, %5, 0x01        ; %4=tmp7_6
+    vpunpcklwd  %2, %8, %4
+    vpunpckhwd  %4, %8, %4
+    vpmaddwd    %2, %2, [rel PW_MF060_MF089_MF050_MF256]  ; %2=tmp4_5L
+    vpmaddwd    %4, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %4=tmp4_5H
+
+    vpaddd      %2, %2, %6              ; %2=data7_5L
+    vpaddd      %4, %4, %7              ; %4=data7_5H
+
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %2, %2, DESCALE_P %+ %9
+    vpsrad      %4, %4, DESCALE_P %+ %9
+
+    vpackssdw   %4, %2, %4              ; %4=data7_5
+
+    vperm2i128  %2, %8, %8, 0x01        ; %2=tmp5_4
+    vpunpcklwd  %8, %5, %2
+    vpunpckhwd  %5, %5, %2
+    vpmaddwd    %8, %8, [rel PW_F050_MF256_F060_MF089]  ; %8=tmp6_7L
+    vpmaddwd    %5, %5, [rel PW_F050_MF256_F060_MF089]  ; %5=tmp6_7H
+
+    vpaddd      %8, %8, %6              ; %8=data3_1L
+    vpaddd      %5, %5, %7              ; %5=data3_1H
+
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %9]
+    vpaddd      %5, %5, [rel PD_DESCALE_P %+ %9]
+    vpsrad      %8, %8, DESCALE_P %+ %9
+    vpsrad      %5, %5, DESCALE_P %+ %9
+
+    vpackssdw   %2, %8, %5              ; %2=data3_1
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_avx2)
+
+EXTN(jconst_fdct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_F050_MF256_F060_MF089   times 4  dw  (F_3_072 - F_2_562), -F_2_562
+                           times 4  dw  (F_1_501 - F_0_899), -F_0_899
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X             times 16 dw  1 << (PASS1_BITS - 1)
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_avx2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_avx2)
+
+EXTN(jsimd_fdct_islow_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)]
+    ; ymm4=(00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17)
+    ; ymm5=(20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37)
+    ; ymm6=(40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57)
+    ; ymm7=(60 61 62 63 64 65 66 67  70 71 72 73 74 75 76 77)
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20
+    vperm2i128  ymm1, ymm4, ymm6, 0x31
+    vperm2i128  ymm2, ymm5, ymm7, 0x20
+    vperm2i128  ymm3, ymm5, ymm7, 0x31
+    ; ymm0=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; ymm1=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; ymm2=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; ymm3=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, 1
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm3=data7_5
+
+    ; ---- Pass 2: process columns.
+
+    vperm2i128  ymm4, ymm1, ymm3, 0x20  ; ymm4=data3_7
+    vperm2i128  ymm1, ymm1, ymm3, 0x31  ; ymm1=data1_5
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+
+    dodct       ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, 2
+    ; ymm0=data0_4, ymm1=data3_1, ymm2=data2_6, ymm4=data7_5
+
+    vperm2i128 ymm3, ymm0, ymm1, 0x30   ; ymm3=data0_1
+    vperm2i128 ymm5, ymm2, ymm1, 0x20   ; ymm5=data2_3
+    vperm2i128 ymm6, ymm0, ymm4, 0x31   ; ymm6=data4_5
+    vperm2i128 ymm7, ymm2, ymm4, 0x21   ; ymm7=data6_7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm3
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm5
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm6
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm7
+
+    vzeroupper
+    uncollect_args 1
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jfdctint-sse2.asm b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
new file mode 100644
index 0000000000..ec1f383ccb
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jfdctint-sse2.asm
@@ -0,0 +1,619 @@
+;
+; jfdctint.asm - accurate integer FDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; forward DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jfdctint.c; see the jfdctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_fdct_islow_sse2)
+
+EXTN(jconst_fdct_islow_sse2):
+
+PW_F130_F054   times 4 dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4 dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4 dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4 dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4 dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4 dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4 dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4 dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4 dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4 dd  1 << (DESCALE_P2 - 1)
+PW_DESCALE_P2X times 8 dw  1 << (PASS1_BITS - 1)
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform the forward DCT on one block of samples.
+;
+; GLOBAL(void)
+; jsimd_fdct_islow_sse2(DCTELEM *data)
+;
+
+; r10 = DCTELEM *data
+
+%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
+%define WK_NUM  6
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_fdct_islow_sse2)
+
+EXTN(jsimd_fdct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 1
+
+    ; ---- Pass 1: process rows.
+
+    mov         rdx, r10                ; (DCTELEM *)
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
+    ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm1              ; xmm0=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm4, xmm1              ; xmm4=(04 14 05 15 06 16 07 17)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm3              ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm5, xmm3              ; xmm5=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
+
+    ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62)
+    ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
+    movdqa      XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm7              ; xmm2=(44 54 45 55 46 56 47 57)
+    movdqa      xmm5, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm3              ; xmm1=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm5, xmm3              ; xmm5=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm1              ; xmm6=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm7, xmm1              ; xmm7=(42 52 62 72 43 53 63 73)
+    movdqa      xmm3, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm3, xmm5              ; xmm3=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=(42 52 62 72 43 53 63 73)
+    movdqa      XMMWORD [wk(3)], xmm2   ; wk(3)=(44 54 64 74 45 55 65 75)
+
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm1              ; xmm0=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm7, xmm1              ; xmm7=(02 12 22 32 03 13 23 33)
+    movdqa      xmm2, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm5              ; xmm4=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm2, xmm5              ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm1, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=(00 10 20 30 40 50 60 70)=data0
+    punpckhqdq  xmm1, xmm6              ; xmm1=(01 11 21 31 41 51 61 71)=data1
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm3              ; xmm2=(06 16 26 36 46 56 66 76)=data6
+    punpckhqdq  xmm5, xmm3              ; xmm5=(07 17 27 37 47 57 67 77)=data7
+
+    movdqa      xmm6, xmm1
+    movdqa      xmm3, xmm0
+    psubw       xmm1, xmm2              ; xmm1=data1-data6=tmp6
+    psubw       xmm0, xmm5              ; xmm0=data0-data7=tmp7
+    paddw       xmm6, xmm2              ; xmm6=data1+data6=tmp1
+    paddw       xmm3, xmm5              ; xmm3=data0+data7=tmp0
+
+    movdqa      xmm2, XMMWORD [wk(2)]   ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, XMMWORD [wk(3)]   ; xmm5=(44 54 64 74 45 55 65 75)
+    movdqa      XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm2              ; xmm7=(02 12 22 32 42 52 62 72)=data2
+    punpckhqdq  xmm1, xmm2              ; xmm1=(03 13 23 33 43 53 63 73)=data3
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm5              ; xmm4=(04 14 24 34 44 54 64 74)=data4
+    punpckhqdq  xmm0, xmm5              ; xmm0=(05 15 25 35 45 55 65 75)=data5
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm5, xmm7
+    paddw       xmm1, xmm4              ; xmm1=data3+data4=tmp3
+    paddw       xmm7, xmm0              ; xmm7=data2+data5=tmp2
+    psubw       xmm2, xmm4              ; xmm2=data3-data4=tmp4
+    psubw       xmm5, xmm0              ; xmm5=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm0, xmm6
+    paddw       xmm3, xmm1              ; xmm3=tmp10
+    paddw       xmm6, xmm7              ; xmm6=tmp11
+    psubw       xmm4, xmm1              ; xmm4=tmp13
+    psubw       xmm0, xmm7              ; xmm0=tmp12
+
+    movdqa      xmm1, xmm3
+    paddw       xmm3, xmm6              ; xmm3=tmp10+tmp11
+    psubw       xmm1, xmm6              ; xmm1=tmp10-tmp11
+
+    psllw       xmm3, PASS1_BITS        ; xmm3=data0
+    psllw       xmm1, PASS1_BITS        ; xmm1=data4
+
+    movdqa      XMMWORD [wk(2)], xmm3   ; wk(2)=data0
+    movdqa      XMMWORD [wk(3)], xmm1   ; wk(3)=data4
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm7, xmm4              ; xmm4=tmp13
+    movdqa      xmm6, xmm4
+    punpcklwd   xmm7, xmm0              ; xmm0=tmp12
+    punpckhwd   xmm6, xmm0
+    movdqa      xmm4, xmm7
+    movdqa      xmm0, xmm6
+    pmaddwd     xmm7, [rel PW_F130_F054]   ; xmm7=data2L
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=data2H
+    pmaddwd     xmm4, [rel PW_F054_MF130]  ; xmm4=data6L
+    pmaddwd     xmm0, [rel PW_F054_MF130]  ; xmm0=data6H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm6, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm0, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm7, xmm6              ; xmm7=data2
+    packssdw    xmm4, xmm0              ; xmm4=data6
+
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=data2
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=data6
+
+    ; -- Odd part
+
+    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=tmp6
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp7
+
+    movdqa      xmm6, xmm2              ; xmm2=tmp4
+    movdqa      xmm0, xmm5              ; xmm5=tmp5
+    paddw       xmm6, xmm3              ; xmm6=z3
+    paddw       xmm0, xmm1              ; xmm0=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm7, xmm6
+    movdqa      xmm4, xmm6
+    punpcklwd   xmm7, xmm0
+    punpckhwd   xmm4, xmm0
+    movdqa      xmm6, xmm7
+    movdqa      xmm0, xmm4
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3L
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3H
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4L
+    pmaddwd     xmm0, [rel PW_F117_F078]   ; xmm0=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm7, xmm2
+    movdqa      xmm4, xmm2
+    punpcklwd   xmm7, xmm1
+    punpckhwd   xmm4, xmm1
+    movdqa      xmm2, xmm7
+    movdqa      xmm1, xmm4
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp4L
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4H
+    pmaddwd     xmm2, [rel PW_MF089_F060]   ; xmm2=tmp7L
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp7H
+
+    paddd       xmm7, XMMWORD [wk(0)]   ; xmm7=data7L
+    paddd       xmm4, XMMWORD [wk(1)]   ; xmm4=data7H
+    paddd       xmm2, xmm6              ; xmm2=data1L
+    paddd       xmm1, xmm0              ; xmm1=data1H
+
+    paddd       xmm7, [rel PD_DESCALE_P1]
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm2, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+
+    packssdw    xmm7, xmm4              ; xmm7=data7
+    packssdw    xmm2, xmm1              ; xmm2=data1
+
+    movdqa      xmm4, xmm5
+    movdqa      xmm1, xmm5
+    punpcklwd   xmm4, xmm3
+    punpckhwd   xmm1, xmm3
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm1
+    pmaddwd     xmm4, [rel PW_MF050_MF256]  ; xmm4=tmp5L
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5H
+    pmaddwd     xmm5, [rel PW_MF256_F050]   ; xmm5=tmp6L
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6H
+
+    paddd       xmm4, xmm6              ; xmm4=data5L
+    paddd       xmm1, xmm0              ; xmm1=data5H
+    paddd       xmm5, XMMWORD [wk(0)]   ; xmm5=data3L
+    paddd       xmm3, XMMWORD [wk(1)]   ; xmm3=data3H
+
+    paddd       xmm4, [rel PD_DESCALE_P1]
+    paddd       xmm1, [rel PD_DESCALE_P1]
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm1, DESCALE_P1
+    paddd       xmm5, [rel PD_DESCALE_P1]
+    paddd       xmm3, [rel PD_DESCALE_P1]
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+
+    packssdw    xmm4, xmm1              ; xmm4=data5
+    packssdw    xmm5, xmm3              ; xmm5=data3
+
+    ; ---- Pass 2: process columns.
+
+    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=col0
+    movdqa      xmm0, XMMWORD [wk(4)]   ; xmm0=col2
+
+    ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72)
+    ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm2              ; xmm6=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm1, xmm2              ; xmm1=(40 41 50 51 60 61 70 71)
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(02 03 12 13 22 23 32 33)
+    punpckhwd   xmm3, xmm5              ; xmm3=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm2, XMMWORD [wk(3)]   ; xmm2=col4
+    movdqa      xmm5, XMMWORD [wk(5)]   ; xmm5=col6
+
+    ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76)
+    ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=(02 03 12 13 22 23 32 33)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(42 43 52 53 62 63 72 73)
+
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm4              ; xmm2=(04 05 14 15 24 25 34 35)
+    punpckhwd   xmm0, xmm4              ; xmm0=(44 45 54 55 64 65 74 75)
+    movdqa      xmm3, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm7              ; xmm5=(06 07 16 17 26 27 36 37)
+    punpckhwd   xmm3, xmm7              ; xmm3=(46 47 56 57 66 67 76 77)
+
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm5              ; xmm2=(04 05 06 07 14 15 16 17)
+    punpckhdq   xmm4, xmm5              ; xmm4=(24 25 26 27 34 35 36 37)
+    movdqa      xmm7, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(44 45 46 47 54 55 56 57)
+    punpckhdq   xmm7, xmm3              ; xmm7=(64 65 66 67 74 75 76 77)
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=(02 03 12 13 22 23 32 33)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53 62 63 72 73)
+    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=(24 25 26 27 34 35 36 37)
+    movdqa      XMMWORD [wk(3)], xmm0   ; wk(3)=(44 45 46 47 54 55 56 57)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm5              ; xmm6=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm4, xmm5              ; xmm4=(20 21 22 23 30 31 32 33)
+    movdqa      xmm0, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm3              ; xmm1=(40 41 42 43 50 51 52 53)
+    punpckhdq   xmm0, xmm3              ; xmm0=(60 61 62 63 70 71 72 73)
+
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm2              ; xmm6=(00 01 02 03 04 05 06 07)=data0
+    punpckhqdq  xmm5, xmm2              ; xmm5=(10 11 12 13 14 15 16 17)=data1
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm7              ; xmm0=(60 61 62 63 64 65 66 67)=data6
+    punpckhqdq  xmm3, xmm7              ; xmm3=(70 71 72 73 74 75 76 77)=data7
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm7, xmm6
+    psubw       xmm5, xmm0              ; xmm5=data1-data6=tmp6
+    psubw       xmm6, xmm3              ; xmm6=data0-data7=tmp7
+    paddw       xmm2, xmm0              ; xmm2=data1+data6=tmp1
+    paddw       xmm7, xmm3              ; xmm7=data0+data7=tmp0
+
+    movdqa      xmm0, XMMWORD [wk(2)]   ; xmm0=(24 25 26 27 34 35 36 37)
+    movdqa      xmm3, XMMWORD [wk(3)]   ; xmm3=(44 45 46 47 54 55 56 57)
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=tmp6
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
+
+    movdqa      xmm5, xmm4              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm4, xmm0              ; xmm4=(20 21 22 23 24 25 26 27)=data2
+    punpckhqdq  xmm5, xmm0              ; xmm5=(30 31 32 33 34 35 36 37)=data3
+    movdqa      xmm6, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm3              ; xmm1=(40 41 42 43 44 45 46 47)=data4
+    punpckhqdq  xmm6, xmm3              ; xmm6=(50 51 52 53 54 55 56 57)=data5
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm3, xmm4
+    paddw       xmm5, xmm1              ; xmm5=data3+data4=tmp3
+    paddw       xmm4, xmm6              ; xmm4=data2+data5=tmp2
+    psubw       xmm0, xmm1              ; xmm0=data3-data4=tmp4
+    psubw       xmm3, xmm6              ; xmm3=data2-data5=tmp5
+
+    ; -- Even part
+
+    movdqa      xmm1, xmm7
+    movdqa      xmm6, xmm2
+    paddw       xmm7, xmm5              ; xmm7=tmp10
+    paddw       xmm2, xmm4              ; xmm2=tmp11
+    psubw       xmm1, xmm5              ; xmm1=tmp13
+    psubw       xmm6, xmm4              ; xmm6=tmp12
+
+    movdqa      xmm5, xmm7
+    paddw       xmm7, xmm2              ; xmm7=tmp10+tmp11
+    psubw       xmm5, xmm2              ; xmm5=tmp10-tmp11
+
+    paddw       xmm7, [rel PW_DESCALE_P2X]
+    paddw       xmm5, [rel PW_DESCALE_P2X]
+    psraw       xmm7, PASS1_BITS        ; xmm7=data0
+    psraw       xmm5, PASS1_BITS        ; xmm5=data4
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm7
+    movdqa      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm5
+
+    ; (Original)
+    ; z1 = (tmp12 + tmp13) * 0.541196100;
+    ; data2 = z1 + tmp13 * 0.765366865;
+    ; data6 = z1 + tmp12 * -1.847759065;
+    ;
+    ; (This implementation)
+    ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
+    ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
+
+    movdqa      xmm4, xmm1              ; xmm1=tmp13
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm4, xmm6              ; xmm6=tmp12
+    punpckhwd   xmm2, xmm6
+    movdqa      xmm1, xmm4
+    movdqa      xmm6, xmm2
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=data2L
+    pmaddwd     xmm2, [rel PW_F130_F054]   ; xmm2=data2H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=data6L
+    pmaddwd     xmm6, [rel PW_F054_MF130]  ; xmm6=data6H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm2, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm6, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm6, DESCALE_P2
+
+    packssdw    xmm4, xmm2              ; xmm4=data2
+    packssdw    xmm1, xmm6              ; xmm1=data6
+
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm1
+
+    ; -- Odd part
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp6
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
+
+    movdqa      xmm2, xmm0              ; xmm0=tmp4
+    movdqa      xmm6, xmm3              ; xmm3=tmp5
+    paddw       xmm2, xmm7              ; xmm2=z3
+    paddw       xmm6, xmm5              ; xmm6=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm1, xmm2
+    punpcklwd   xmm4, xmm6
+    punpckhwd   xmm1, xmm6
+    movdqa      xmm2, xmm4
+    movdqa      xmm6, xmm1
+    pmaddwd     xmm4, [rel PW_MF078_F117]  ; xmm4=z3L
+    pmaddwd     xmm1, [rel PW_MF078_F117]  ; xmm1=z3H
+    pmaddwd     xmm2, [rel PW_F117_F078]   ; xmm2=z4L
+    pmaddwd     xmm6, [rel PW_F117_F078]   ; xmm6=z4H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=z3L
+    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=z3H
+
+    ; (Original)
+    ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
+    ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
+    ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
+    ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
+    ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
+    ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
+    ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
+    ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
+    ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm1, xmm0
+    punpcklwd   xmm4, xmm5
+    punpckhwd   xmm1, xmm5
+    movdqa      xmm0, xmm4
+    movdqa      xmm5, xmm1
+    pmaddwd     xmm4, [rel PW_MF060_MF089]  ; xmm4=tmp4L
+    pmaddwd     xmm1, [rel PW_MF060_MF089]  ; xmm1=tmp4H
+    pmaddwd     xmm0, [rel PW_MF089_F060]   ; xmm0=tmp7L
+    pmaddwd     xmm5, [rel PW_MF089_F060]   ; xmm5=tmp7H
+
+    paddd       xmm4,  XMMWORD [wk(0)]  ; xmm4=data7L
+    paddd       xmm1,  XMMWORD [wk(1)]  ; xmm1=data7H
+    paddd       xmm0, xmm2              ; xmm0=data1L
+    paddd       xmm5, xmm6              ; xmm5=data1H
+
+    paddd       xmm4, [rel PD_DESCALE_P2]
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm0, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+
+    packssdw    xmm4, xmm1              ; xmm4=data7
+    packssdw    xmm0, xmm5              ; xmm0=data1
+
+    movdqa      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm4
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm0
+
+    movdqa      xmm1, xmm3
+    movdqa      xmm5, xmm3
+    punpcklwd   xmm1, xmm7
+    punpckhwd   xmm5, xmm7
+    movdqa      xmm3, xmm1
+    movdqa      xmm7, xmm5
+    pmaddwd     xmm1, [rel PW_MF050_MF256]  ; xmm1=tmp5L
+    pmaddwd     xmm5, [rel PW_MF050_MF256]  ; xmm5=tmp5H
+    pmaddwd     xmm3, [rel PW_MF256_F050]   ; xmm3=tmp6L
+    pmaddwd     xmm7, [rel PW_MF256_F050]   ; xmm7=tmp6H
+
+    paddd       xmm1, xmm2              ; xmm1=data5L
+    paddd       xmm5, xmm6              ; xmm5=data5H
+    paddd       xmm3, XMMWORD [wk(0)]   ; xmm3=data3L
+    paddd       xmm7, XMMWORD [wk(1)]   ; xmm7=data3H
+
+    paddd       xmm1, [rel PD_DESCALE_P2]
+    paddd       xmm5, [rel PD_DESCALE_P2]
+    psrad       xmm1, DESCALE_P2
+    psrad       xmm5, DESCALE_P2
+    paddd       xmm3, [rel PD_DESCALE_P2]
+    paddd       xmm7, [rel PD_DESCALE_P2]
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm1, xmm5              ; xmm1=data5
+    packssdw    xmm3, xmm7              ; xmm3=data3
+
+    movdqa      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm3
+
+    uncollect_args 1
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctflt-sse2.asm b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
new file mode 100644
index 0000000000..60bf961896
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctflt-sse2.asm
@@ -0,0 +1,482 @@
+;
+; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a floating-point implementation of the inverse DCT
+; (Discrete Cosine Transform). The following code is based directly on
+; the IJG's original jidctflt.c; see the jidctflt.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+    shufps      %1, %2, 0x44
+%endmacro
+
+%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+    shufps      %1, %2, 0xEE
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_float_sse2)
+
+EXTN(jconst_idct_float_sse2):
+
+PD_1_414        times 4  dd  1.414213562373095048801689
+PD_1_847        times 4  dd  1.847759065022573512256366
+PD_1_082        times 4  dd  1.082392200292393968799446
+PD_M2_613       times 4  dd -2.613125929752753055713286
+PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
+                                        ; FAST_FLOAT workspace[DCTSIZE2]
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
+
+EXTN(jsimd_idct_float_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [workspace]
+    collect_args 4
+    push        rbx
+
+    ; ---- Pass 1: process columns from input, store into work array.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rcx, DCTSIZE/4          ; ctr
+.columnloop:
+%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm2
+    por         xmm3, xmm4
+    por         xmm5, xmm6
+    por         xmm1, xmm3
+    por         xmm5, xmm7
+    por         xmm1, xmm5
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm1, xmm0
+    movaps      xmm2, xmm0
+    movaps      xmm3, xmm0
+
+    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
+    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
+    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
+    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    jmp         near .nextcolumn
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
+    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
+
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+
+    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
+    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
+    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
+    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
+
+    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
+    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
+    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
+    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
+    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
+
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
+    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
+    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
+    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
+    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
+    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
+    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
+    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
+    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
+
+    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
+    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm0, xmm7
+    movaps      xmm3, xmm5
+    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
+    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
+    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
+    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
+
+    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
+    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
+    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
+    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
+    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
+    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
+
+    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
+    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
+    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
+    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
+    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
+    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
+
+    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
+    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
+    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
+
+    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
+    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
+    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
+    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
+    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
+    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
+
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
+    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
+    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
+
+.nextcolumn:
+    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
+    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
+    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
+    dec         rcx                                    ; ctr
+    jnz         near .columnloop
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+    mov         rcx, DCTSIZE/4          ; ctr
+.rowloop:
+
+    ; -- Even part
+
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm0
+    movaps      xmm5, xmm1
+    subps       xmm0, xmm2              ; xmm0=tmp11
+    subps       xmm1, xmm3
+    addps       xmm4, xmm2              ; xmm4=tmp10
+    addps       xmm5, xmm3              ; xmm5=tmp13
+
+    mulps       xmm1, [rel PD_1_414]
+    subps       xmm1, xmm5              ; xmm1=tmp12
+
+    movaps      xmm6, xmm4
+    movaps      xmm7, xmm0
+    subps       xmm4, xmm5              ; xmm4=tmp3
+    subps       xmm0, xmm1              ; xmm0=tmp2
+    addps       xmm6, xmm5              ; xmm6=tmp0
+    addps       xmm7, xmm1              ; xmm7=tmp1
+
+    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
+    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
+
+    ; -- Odd part
+
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
+
+    movaps      xmm4, xmm2
+    movaps      xmm0, xmm5
+    addps       xmm2, xmm1              ; xmm2=z11
+    addps       xmm5, xmm3              ; xmm5=z13
+    subps       xmm4, xmm1              ; xmm4=z12
+    subps       xmm0, xmm3              ; xmm0=z10
+
+    movaps      xmm1, xmm2
+    subps       xmm2, xmm5
+    addps       xmm1, xmm5              ; xmm1=tmp7
+
+    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
+
+    movaps      xmm3, xmm0
+    addps       xmm0, xmm4
+    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
+    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
+    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
+    addps       xmm3, xmm0              ; xmm3=tmp12
+    subps       xmm4, xmm0              ; xmm4=tmp10
+
+    ; -- Final output stage
+
+    subps       xmm3, xmm1              ; xmm3=tmp6
+    movaps      xmm5, xmm6
+    movaps      xmm0, xmm7
+    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
+    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
+    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
+    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
+    subps       xmm2, xmm3              ; xmm2=tmp5
+
+    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm3, xmm3
+    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
+    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
+    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
+    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
+
+    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
+    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
+    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
+    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
+
+    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
+    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
+
+    addps       xmm4, xmm2              ; xmm4=tmp4
+    movaps      xmm7, xmm1
+    movaps      xmm5, xmm3
+    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
+    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
+    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
+    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
+
+    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
+    pcmpeqd     xmm4, xmm4
+    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
+
+    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
+    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
+    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
+    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
+
+    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
+    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
+    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
+    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
+    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
+    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
+    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
+    paddb       xmm6, xmm2
+    paddb       xmm1, xmm2
+
+    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+
+    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
+    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+
+    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
+
+    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
+    add         rdi, byte 4*SIZEOF_JSAMPROW
+    dec         rcx                            ; ctr
+    jnz         near .rowloop
+
+    pop         rbx
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctfst-sse2.asm b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
new file mode 100644
index 0000000000..cb97fdfbb2
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctfst-sse2.asm
@@ -0,0 +1,491 @@
+;
+; jidctfst.asm - fast integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a fast, not so accurate integer implementation of
+; the inverse DCT (Discrete Cosine Transform). The following code is
+; based directly on the IJG's original jidctfst.c; see the jidctfst.c
+; for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  8  ; 14 is also OK.
+%define PASS1_BITS  2
+
+%if IFAST_SCALE_BITS != PASS1_BITS
+%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
+%endif
+
+%if CONST_BITS == 8
+F_1_082 equ 277              ; FIX(1.082392200)
+F_1_414 equ 362              ; FIX(1.414213562)
+F_1_847 equ 473              ; FIX(1.847759065)
+F_2_613 equ 669              ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
+F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
+F_1_613 equ (F_2_613 - (1 << CONST_BITS))         ; FIX(2.613125930) - FIX(1)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
+; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
+
+%define PRE_MULTIPLY_SCALE_BITS  2
+%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_ifast_sse2)
+
+EXTN(jconst_idct_ifast_sse2):
+
+PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
+PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
+PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
+PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
+;                      JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)
+
+EXTN(jsimd_idct_ifast_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm1
+    psubw       xmm0, xmm2              ; xmm0=tmp11
+    psubw       xmm1, xmm3
+    paddw       xmm4, xmm2              ; xmm4=tmp10
+    paddw       xmm5, xmm3              ; xmm5=tmp13
+
+    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm1, [rel PW_F1414]
+    psubw       xmm1, xmm5              ; xmm1=tmp12
+
+    movdqa      xmm6, xmm4
+    movdqa      xmm7, xmm0
+    psubw       xmm4, xmm5              ; xmm4=tmp3
+    psubw       xmm0, xmm1              ; xmm0=tmp2
+    paddw       xmm6, xmm5              ; xmm6=tmp0
+    paddw       xmm7, xmm1              ; xmm7=tmp1
+
+    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
+    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
+
+    ; -- Odd part
+
+    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
+
+    movdqa      xmm4, xmm2
+    movdqa      xmm0, xmm5
+    psubw       xmm2, xmm1              ; xmm2=z12
+    psubw       xmm5, xmm3              ; xmm5=z10
+    paddw       xmm4, xmm1              ; xmm4=z11
+    paddw       xmm0, xmm3              ; xmm0=z13
+
+    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm3, xmm4
+    psubw       xmm4, xmm0
+    paddw       xmm3, xmm0              ; xmm3=tmp7
+
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm4, [rel PW_F1414]    ; xmm4=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm0, xmm5
+    paddw       xmm5, xmm2
+    pmulhw      xmm5, [rel PW_F1847]    ; xmm5=z5
+    pmulhw      xmm0, [rel PW_MF1613]
+    pmulhw      xmm2, [rel PW_F1082]
+    psubw       xmm0, xmm1
+    psubw       xmm2, xmm5              ; xmm2=tmp10
+    paddw       xmm0, xmm5              ; xmm0=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm0, xmm3              ; xmm0=tmp6
+    movdqa      xmm1, xmm6
+    movdqa      xmm5, xmm7
+    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
+    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
+    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
+    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
+    psubw       xmm4, xmm0              ; xmm4=tmp5
+
+    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
+    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
+    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
+    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)
+
+    paddw       xmm2, xmm4              ; xmm2=tmp4
+    movdqa      xmm5, xmm7
+    movdqa      xmm0, xmm1
+    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
+    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
+    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
+    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)
+
+    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
+    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
+    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
+    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3
+
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6
+
+    movdqa      xmm2, xmm6
+    movdqa      xmm0, xmm5
+    psubw       xmm6, xmm1              ; xmm6=tmp11
+    psubw       xmm5, xmm3
+    paddw       xmm2, xmm1              ; xmm2=tmp10
+    paddw       xmm0, xmm3              ; xmm0=tmp13
+
+    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm5, [rel PW_F1414]
+    psubw       xmm5, xmm0              ; xmm5=tmp12
+
+    movdqa      xmm1, xmm2
+    movdqa      xmm3, xmm6
+    psubw       xmm2, xmm0              ; xmm2=tmp3
+    psubw       xmm6, xmm5              ; xmm6=tmp2
+    paddw       xmm1, xmm0              ; xmm1=tmp0
+    paddw       xmm3, xmm5              ; xmm3=tmp1
+
+    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2
+
+    ; -- Odd part
+
+    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7
+
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm4
+    psubw       xmm0, xmm7              ; xmm0=z12
+    psubw       xmm4, xmm5              ; xmm4=z10
+    paddw       xmm2, xmm7              ; xmm2=z11
+    paddw       xmm6, xmm5              ; xmm6=z13
+
+    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
+    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
+    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
+
+    movdqa      xmm5, xmm2
+    psubw       xmm2, xmm6
+    paddw       xmm5, xmm6              ; xmm5=tmp7
+
+    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
+    pmulhw      xmm2, [rel PW_F1414]    ; xmm2=tmp11
+
+    ; To avoid overflow...
+    ;
+    ; (Original)
+    ; tmp12 = -2.613125930 * z10 + z5;
+    ;
+    ; (This implementation)
+    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
+    ;       = -1.613125930 * z10 - z10 + z5;
+
+    movdqa      xmm6, xmm4
+    paddw       xmm4, xmm0
+    pmulhw      xmm4, [rel PW_F1847]    ; xmm4=z5
+    pmulhw      xmm6, [rel PW_MF1613]
+    pmulhw      xmm0, [rel PW_F1082]
+    psubw       xmm6, xmm7
+    psubw       xmm0, xmm4              ; xmm0=tmp10
+    paddw       xmm6, xmm4              ; xmm6=tmp12
+
+    ; -- Final output stage
+
+    psubw       xmm6, xmm5              ; xmm6=tmp6
+    movdqa      xmm7, xmm1
+    movdqa      xmm4, xmm3
+    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
+    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    psraw       xmm1, (PASS1_BITS+3)    ; descale
+    psraw       xmm3, (PASS1_BITS+3)    ; descale
+    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
+    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psubw       xmm2, xmm6              ; xmm2=tmp5
+
+    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
+    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3
+
+    paddw       xmm0, xmm2              ; xmm0=tmp4
+    movdqa      xmm4, xmm5
+    movdqa      xmm7, xmm6
+    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
+    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
+    psraw       xmm5, (PASS1_BITS+3)    ; descale
+    psraw       xmm6, (PASS1_BITS+3)    ; descale
+    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
+    psraw       xmm4, (PASS1_BITS+3)    ; descale
+    psraw       xmm7, (PASS1_BITS+3)    ; descale
+
+    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
+
+    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm1, xmm2
+    paddb       xmm3, xmm2
+    paddb       xmm5, xmm2
+    paddb       xmm7, xmm2
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
+    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
+    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
+    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
+    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
+    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctint-avx2.asm b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
new file mode 100644
index 0000000000..ca7e317f6e
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-avx2.asm
@@ -0,0 +1,418 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit inverse matrix transpose using AVX2 instructions
+; %1-%4: Input/output registers
+; %5-%8: Temp registers
+
+%macro dotranspose 8
+    ; %5=(00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71)
+    ; %6=(03 13 23 33 43 53 63 73  02 12 22 32 42 52 62 72)
+    ; %7=(04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75)
+    ; %8=(07 17 27 37 47 57 67 77  06 16 26 36 46 56 66 76)
+
+    vpermq      %5, %1, 0xD8
+    vpermq      %6, %2, 0x72
+    vpermq      %7, %3, 0xD8
+    vpermq      %8, %4, 0x72
+    ; transpose coefficients(phase 1)
+    ; %5=(00 10 20 30 01 11 21 31  40 50 60 70 41 51 61 71)
+    ; %6=(02 12 22 32 03 13 23 33  42 52 62 72 43 53 63 73)
+    ; %7=(04 14 24 34 05 15 25 35  44 54 64 74 45 55 65 75)
+    ; %8=(06 16 26 36 07 17 27 37  46 56 66 76 47 57 67 77)
+
+    vpunpcklwd  %1, %5, %6
+    vpunpckhwd  %2, %5, %6
+    vpunpcklwd  %3, %7, %8
+    vpunpckhwd  %4, %7, %8
+    ; transpose coefficients(phase 2)
+    ; %1=(00 02 10 12 20 22 30 32  40 42 50 52 60 62 70 72)
+    ; %2=(01 03 11 13 21 23 31 33  41 43 51 53 61 63 71 73)
+    ; %3=(04 06 14 16 24 26 34 36  44 46 54 56 64 66 74 76)
+    ; %4=(05 07 15 17 25 27 35 37  45 47 55 57 65 67 75 77)
+
+    vpunpcklwd  %5, %1, %2
+    vpunpcklwd  %6, %3, %4
+    vpunpckhwd  %7, %1, %2
+    vpunpckhwd  %8, %3, %4
+    ; transpose coefficients(phase 3)
+    ; %5=(00 01 02 03 10 11 12 13  40 41 42 43 50 51 52 53)
+    ; %6=(04 05 06 07 14 15 16 17  44 45 46 47 54 55 56 57)
+    ; %7=(20 21 22 23 30 31 32 33  60 61 62 63 70 71 72 73)
+    ; %8=(24 25 26 27 34 35 36 37  64 65 66 67 74 75 76 77)
+
+    vpunpcklqdq %1, %5, %6
+    vpunpckhqdq %2, %5, %6
+    vpunpcklqdq %3, %7, %8
+    vpunpckhqdq %4, %7, %8
+    ; transpose coefficients(phase 4)
+    ; %1=(00 01 02 03 04 05 06 07  40 41 42 43 44 45 46 47)
+    ; %2=(10 11 12 13 14 15 16 17  50 51 52 53 54 55 56 57)
+    ; %3=(20 21 22 23 24 25 26 27  60 61 62 63 64 65 66 67)
+    ; %4=(30 31 32 33 34 35 36 37  70 71 72 73 74 75 76 77)
+%endmacro
+
+; --------------------------------------------------------------------------
+; In-place 8x8x16-bit accurate integer inverse DCT using AVX2 instructions
+; %1-%4:  Input/output registers
+; %5-%12: Temp registers
+; %9:     Pass (1 or 2)
+
+%macro dodct 13
+    ; -- Even part
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    vperm2i128  %6, %3, %3, 0x01        ; %6=in6_2
+    vpunpcklwd  %5, %3, %6              ; %5=in26_62L
+    vpunpckhwd  %6, %3, %6              ; %6=in26_62H
+    vpmaddwd    %5, %5, [rel PW_F130_F054_MF130_F054]  ; %5=tmp3_2L
+    vpmaddwd    %6, %6, [rel PW_F130_F054_MF130_F054]  ; %6=tmp3_2H
+
+    vperm2i128  %7, %1, %1, 0x01        ; %7=in4_0
+    vpsignw     %1, %1, [rel PW_1_NEG1]
+    vpaddw      %7, %7, %1              ; %7=(in0+in4)_(in0-in4)
+
+    vpxor       %1, %1, %1
+    vpunpcklwd  %8, %1, %7              ; %8=tmp0_1L
+    vpunpckhwd  %1, %1, %7              ; %1=tmp0_1H
+    vpsrad      %8, %8, (16-CONST_BITS)  ; vpsrad %8,16 & vpslld %8,CONST_BITS
+    vpsrad      %1, %1, (16-CONST_BITS)  ; vpsrad %1,16 & vpslld %1,CONST_BITS
+
+    vpsubd      %11, %8, %5             ; %11=tmp0_1L-tmp3_2L=tmp13_12L
+    vpaddd      %9, %8, %5              ; %9=tmp0_1L+tmp3_2L=tmp10_11L
+    vpsubd      %12, %1, %6             ; %12=tmp0_1H-tmp3_2H=tmp13_12H
+    vpaddd      %10, %1, %6             ; %10=tmp0_1H+tmp3_2H=tmp10_11H
+
+    ; -- Odd part
+
+    vpaddw      %1, %4, %2              ; %1=in7_5+in3_1=z3_4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    vperm2i128  %8, %1, %1, 0x01        ; %8=z4_3
+    vpunpcklwd  %7, %1, %8              ; %7=z34_43L
+    vpunpckhwd  %8, %1, %8              ; %8=z34_43H
+    vpmaddwd    %7, %7, [rel PW_MF078_F117_F078_F117]  ; %7=z3_4L
+    vpmaddwd    %8, %8, [rel PW_MF078_F117_F078_F117]  ; %8=z3_4H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    vperm2i128  %2, %2, %2, 0x01        ; %2=in1_3
+    vpunpcklwd  %3, %4, %2              ; %3=in71_53L
+    vpunpckhwd  %4, %4, %2              ; %4=in71_53H
+
+    vpmaddwd    %5, %3, [rel PW_MF060_MF089_MF050_MF256]  ; %5=tmp0_1L
+    vpmaddwd    %6, %4, [rel PW_MF060_MF089_MF050_MF256]  ; %6=tmp0_1H
+    vpaddd      %5, %5, %7              ; %5=tmp0_1L+z3_4L=tmp0_1L
+    vpaddd      %6, %6, %8              ; %6=tmp0_1H+z3_4H=tmp0_1H
+
+    vpmaddwd    %3, %3, [rel PW_MF089_F060_MF256_F050]  ; %3=tmp3_2L
+    vpmaddwd    %4, %4, [rel PW_MF089_F060_MF256_F050]  ; %4=tmp3_2H
+    vperm2i128  %7, %7, %7, 0x01        ; %7=z4_3L
+    vperm2i128  %8, %8, %8, 0x01        ; %8=z4_3H
+    vpaddd      %7, %3, %7              ; %7=tmp3_2L+z4_3L=tmp3_2L
+    vpaddd      %8, %4, %8              ; %8=tmp3_2H+z4_3H=tmp3_2H
+
+    ; -- Final output stage
+
+    vpaddd      %1, %9, %7              ; %1=tmp10_11L+tmp3_2L=data0_1L
+    vpaddd      %2, %10, %8             ; %2=tmp10_11H+tmp3_2H=data0_1H
+    vpaddd      %1, %1, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %2, %2, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %1, %1, DESCALE_P %+ %13
+    vpsrad      %2, %2, DESCALE_P %+ %13
+    vpackssdw   %1, %1, %2              ; %1=data0_1
+
+    vpsubd      %3, %9, %7              ; %3=tmp10_11L-tmp3_2L=data7_6L
+    vpsubd      %4, %10, %8             ; %4=tmp10_11H-tmp3_2H=data7_6H
+    vpaddd      %3, %3, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %4, %4, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %3, %3, DESCALE_P %+ %13
+    vpsrad      %4, %4, DESCALE_P %+ %13
+    vpackssdw   %4, %3, %4              ; %4=data7_6
+
+    vpaddd      %7, %11, %5             ; %7=tmp13_12L+tmp0_1L=data3_2L
+    vpaddd      %8, %12, %6             ; %8=tmp13_12H+tmp0_1H=data3_2H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %2, %7, %8              ; %2=data3_2
+
+    vpsubd      %7, %11, %5             ; %7=tmp13_12L-tmp0_1L=data4_5L
+    vpsubd      %8, %12, %6             ; %8=tmp13_12H-tmp0_1H=data4_5H
+    vpaddd      %7, %7, [rel PD_DESCALE_P %+ %13]
+    vpaddd      %8, %8, [rel PD_DESCALE_P %+ %13]
+    vpsrad      %7, %7, DESCALE_P %+ %13
+    vpsrad      %8, %8, DESCALE_P %+ %13
+    vpackssdw   %3, %7, %8              ; %3=data4_5
+%endmacro
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_avx2)
+
+EXTN(jconst_idct_islow_avx2):
+
+PW_F130_F054_MF130_F054    times 4  dw  (F_0_541 + F_0_765),  F_0_541
+                           times 4  dw  (F_0_541 - F_1_847),  F_0_541
+PW_MF078_F117_F078_F117    times 4  dw  (F_1_175 - F_1_961),  F_1_175
+                           times 4  dw  (F_1_175 - F_0_390),  F_1_175
+PW_MF060_MF089_MF050_MF256 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+                           times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF089_F060_MF256_F050   times 4  dw -F_0_899, (F_1_501 - F_0_899)
+                           times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1              times 8  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2              times 8  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP             times 32 db  CENTERJSAMPLE
+PW_1_NEG1                  times 8  dw  1
+                           times 8  dw -1
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_avx2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_avx2)
+
+EXTN(jsimd_idct_islow_avx2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    push_xmm    4
+    collect_args 4
+
+    ; ---- Pass 1: process columns.
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_AVX2
+    mov         eax, dword [DWBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,r11,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(3,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(4,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(5,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, XMMWORD [XMMBLOCK(6,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm0, xmm0, XMMWORD [XMMBLOCK(7,0,r11,SIZEOF_JCOEF)]
+    vpor        xmm1, xmm1, xmm0
+    vpacksswb   xmm1, xmm1, xmm1
+    vpacksswb   xmm1, xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,r11,SIZEOF_JCOEF)]
+    vpmullw     xmm5, xmm5, XMMWORD [XMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vpsllw      xmm5, xmm5, PASS1_BITS
+
+    vpunpcklwd  xmm4, xmm5, xmm5        ; xmm4=(00 00 01 01 02 02 03 03)
+    vpunpckhwd  xmm5, xmm5, xmm5        ; xmm5=(04 04 05 05 06 06 07 07)
+    vinserti128 ymm4, ymm4, xmm5, 1
+
+    vpshufd     ymm0, ymm4, 0x00        ; ymm0=col0_4=(00 00 00 00 00 00 00 00  04 04 04 04 04 04 04 04)
+    vpshufd     ymm1, ymm4, 0x55        ; ymm1=col1_5=(01 01 01 01 01 01 01 01  05 05 05 05 05 05 05 05)
+    vpshufd     ymm2, ymm4, 0xAA        ; ymm2=col2_6=(02 02 02 02 02 02 02 02  06 06 06 06 06 06 06 06)
+    vpshufd     ymm3, ymm4, 0xFF        ; ymm3=col3_7=(03 03 03 03 03 03 03 03  07 07 07 07 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,r11,SIZEOF_JCOEF)]  ; ymm4=in0_1
+    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,r11,SIZEOF_JCOEF)]  ; ymm5=in2_3
+    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,r11,SIZEOF_JCOEF)]  ; ymm6=in4_5
+    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,r11,SIZEOF_JCOEF)]  ; ymm7=in6_7
+    vpmullw     ymm4, ymm4, YMMWORD [YMMBLOCK(0,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm5, ymm5, YMMWORD [YMMBLOCK(2,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm6, ymm6, YMMWORD [YMMBLOCK(4,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+    vpmullw     ymm7, ymm7, YMMWORD [YMMBLOCK(6,0,r10,SIZEOF_ISLOW_MULT_TYPE)]
+
+    vperm2i128  ymm0, ymm4, ymm6, 0x20  ; ymm0=in0_4
+    vperm2i128  ymm1, ymm5, ymm4, 0x31  ; ymm1=in3_1
+    vperm2i128  ymm2, ymm5, ymm7, 0x20  ; ymm2=in2_6
+    vperm2i128  ymm3, ymm7, ymm6, 0x31  ; ymm3=in7_5
+
+    dodct ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 1
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm3=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm3=data3_7
+
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [r11 + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows.
+
+    vperm2i128  ymm4, ymm3, ymm1, 0x31  ; ymm3=in7_5
+    vperm2i128  ymm1, ymm3, ymm1, 0x20  ; ymm1=in3_1
+
+    dodct ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, 2
+    ; ymm0=data0_1, ymm1=data3_2, ymm2=data4_5, ymm4=data7_6
+
+    dotranspose ymm0, ymm1, ymm2, ymm4, ymm3, ymm5, ymm6, ymm7
+    ; ymm0=data0_4, ymm1=data1_5, ymm2=data2_6, ymm4=data3_7
+
+    vpacksswb   ymm0, ymm0, ymm1        ; ymm0=data01_45
+    vpacksswb   ymm1, ymm2, ymm4        ; ymm1=data23_67
+    vpaddb      ymm0, ymm0, [rel PB_CENTERJSAMP]
+    vpaddb      ymm1, ymm1, [rel PB_CENTERJSAMP]
+
+    vextracti128 xmm6, ymm1, 1          ; xmm3=data67
+    vextracti128 xmm4, ymm0, 1          ; xmm2=data45
+    vextracti128 xmm2, ymm1, 0          ; xmm1=data23
+    vextracti128 xmm0, ymm0, 0          ; xmm0=data01
+
+    vpshufd     xmm1, xmm0, 0x4E  ; xmm1=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    vpshufd     xmm3, xmm2, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    vpshufd     xmm5, xmm4, 0x4E  ; xmm5=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    vpshufd     xmm7, xmm6, 0x4E  ; xmm7=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    vzeroupper
+
+    mov         eax, r13d
+
+    mov         rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+
+    mov         rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    mov         rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
+
+    uncollect_args 4
+    pop_xmm     4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctint-sse2.asm b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
new file mode 100644
index 0000000000..7aa869bc0b
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctint-sse2.asm
@@ -0,0 +1,847 @@
+;
+; jidctint.asm - accurate integer IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains a slower but more accurate integer implementation of the
+; inverse DCT (Discrete Cosine Transform). The following code is based
+; directly on the IJG's original jidctint.c; see the jidctint.c for
+; more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS  13
+%define PASS1_BITS  2
+
+%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
+%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
+
+%if CONST_BITS == 13
+F_0_298 equ  2446  ; FIX(0.298631336)
+F_0_390 equ  3196  ; FIX(0.390180644)
+F_0_541 equ  4433  ; FIX(0.541196100)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_175 equ  9633  ; FIX(1.175875602)
+F_1_501 equ 12299  ; FIX(1.501321110)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_1_961 equ 16069  ; FIX(1.961570560)
+F_2_053 equ 16819  ; FIX(2.053119869)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_072 equ 25172  ; FIX(3.072711026)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
+F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
+F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
+F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
+F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_islow_sse2)
+
+EXTN(jconst_idct_islow_sse2):
+
+PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541
+PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)
+PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175
+PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)
+PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899
+PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)
+PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562
+PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)
+PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
+PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
+PB_CENTERJSAMP times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients.
+;
+; GLOBAL(void)
+; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
+;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = jpeg_component_info *compptr
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        12
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
+
+EXTN(jsimd_idct_islow_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         near .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, xmm0
+    packsswb    xmm1, xmm1
+    packsswb    xmm1, xmm1
+    movd        eax, xmm1
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm5, PASS1_BITS
+
+    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
+    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
+    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
+    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
+    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
+    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
+    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
+    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
+
+    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Even part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm4, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
+    punpckhwd   xmm5, xmm3
+    movdqa      xmm1, xmm4
+    movdqa      xmm3, xmm5
+    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm3, [rel PW_F054_MF130]  ; xmm3=tmp2H
+
+    movdqa      xmm6, xmm0
+    paddw       xmm0, xmm2              ; xmm0=in0+in4
+    psubw       xmm6, xmm2              ; xmm6=in0-in4
+
+    pxor        xmm7, xmm7
+    pxor        xmm2, xmm2
+    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
+    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
+
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm4              ; xmm7=tmp10L
+    psubd       xmm0, xmm4              ; xmm0=tmp13L
+    movdqa      xmm4, xmm2
+    paddd       xmm2, xmm5              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm7, xmm7
+    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
+    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
+
+    movdqa      xmm2, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm2, xmm1              ; xmm2=tmp12L
+    movdqa      xmm0, xmm7
+    paddd       xmm7, xmm3              ; xmm7=tmp11H
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm7, xmm4
+    paddw       xmm5, xmm3              ; xmm5=z3
+    paddw       xmm7, xmm1              ; xmm7=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm5
+    punpcklwd   xmm2, xmm7
+    punpckhwd   xmm0, xmm7
+    movdqa      xmm5, xmm2
+    movdqa      xmm7, xmm0
+    pmaddwd     xmm2, [rel PW_MF078_F117]  ; xmm2=z3L
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm7, [rel PW_F117_F078]   ; xmm7=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm3
+    punpcklwd   xmm2, xmm4
+    punpckhwd   xmm0, xmm4
+    movdqa      xmm3, xmm2
+    movdqa      xmm4, xmm0
+    pmaddwd     xmm2, [rel PW_MF060_MF089]  ; xmm2=tmp0L
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0H
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3L
+    pmaddwd     xmm4, [rel PW_MF089_F060]   ; xmm4=tmp3H
+
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
+    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
+    paddd       xmm3, xmm5              ; xmm3=tmp3L
+    paddd       xmm4, xmm7              ; xmm4=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
+
+    movdqa      xmm2, xmm1
+    movdqa      xmm0, xmm1
+    punpcklwd   xmm2, xmm6
+    punpckhwd   xmm0, xmm6
+    movdqa      xmm1, xmm2
+    movdqa      xmm6, xmm0
+    pmaddwd     xmm2, [rel PW_MF050_MF256]  ; xmm2=tmp1L
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1H
+    pmaddwd     xmm1, [rel PW_MF256_F050]   ; xmm1=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm2, xmm5              ; xmm2=tmp1L
+    paddd       xmm0, xmm7              ; xmm0=tmp1H
+    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
+
+    movdqa      xmm2, xmm5
+    movdqa      xmm0, xmm7
+    paddd       xmm5, xmm3              ; xmm5=data0L
+    paddd       xmm7, xmm4              ; xmm7=data0H
+    psubd       xmm2, xmm3              ; xmm2=data7L
+    psubd       xmm0, xmm4              ; xmm0=data7H
+
+    movdqa      xmm3, [rel PD_DESCALE_P1]  ; xmm3=[rel PD_DESCALE_P1]
+
+    paddd       xmm5, xmm3
+    paddd       xmm7, xmm3
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm7, DESCALE_P1
+    paddd       xmm2, xmm3
+    paddd       xmm0, xmm3
+    psrad       xmm2, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
+
+    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
+    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
+
+    movdqa      xmm7, xmm4
+    movdqa      xmm0, xmm3
+    paddd       xmm4, xmm1              ; xmm4=data1L
+    paddd       xmm3, xmm6              ; xmm3=data1H
+    psubd       xmm7, xmm1              ; xmm7=data6L
+    psubd       xmm0, xmm6              ; xmm0=data6H
+
+    movdqa      xmm1, [rel PD_DESCALE_P1]  ; xmm1=[rel PD_DESCALE_P1]
+
+    paddd       xmm4, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm4, DESCALE_P1
+    psrad       xmm3, DESCALE_P1
+    paddd       xmm7, xmm1
+    paddd       xmm0, xmm1
+    psrad       xmm7, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+
+    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
+
+    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
+    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
+    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
+    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
+    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
+    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
+    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
+    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
+    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
+    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
+
+    movdqa      xmm5, xmm3
+    movdqa      xmm6, xmm0
+    paddd       xmm3, xmm4              ; xmm3=data2L
+    paddd       xmm0, xmm2              ; xmm0=data2H
+    psubd       xmm5, xmm4              ; xmm5=data5L
+    psubd       xmm6, xmm2              ; xmm6=data5H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1]  ; xmm7=[rel PD_DESCALE_P1]
+
+    paddd       xmm3, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm3, DESCALE_P1
+    psrad       xmm0, DESCALE_P1
+    paddd       xmm5, xmm7
+    paddd       xmm6, xmm7
+    psrad       xmm5, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
+    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
+
+    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
+    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
+    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
+    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm6, xmm4
+    paddd       xmm1, xmm2              ; xmm1=data3L
+    paddd       xmm4, xmm7              ; xmm4=data3H
+    psubd       xmm0, xmm2              ; xmm0=data4L
+    psubd       xmm6, xmm7              ; xmm6=data4H
+
+    movdqa      xmm2, [rel PD_DESCALE_P1]  ; xmm2=[rel PD_DESCALE_P1]
+
+    paddd       xmm1, xmm2
+    paddd       xmm4, xmm2
+    psrad       xmm1, DESCALE_P1
+    psrad       xmm4, DESCALE_P1
+    paddd       xmm0, xmm2
+    paddd       xmm6, xmm2
+    psrad       xmm0, DESCALE_P1
+    psrad       xmm6, DESCALE_P1
+
+    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
+    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
+    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
+
+    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
+    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
+    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
+    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
+    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
+
+    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
+    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
+    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
+    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
+    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
+
+    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
+    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
+
+    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
+    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
+    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
+    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
+    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
+
+    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
+    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
+    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
+    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
+
+    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
+
+    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
+    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
+
+    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
+    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
+    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
+    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
+    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
+
+    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
+    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows from work array, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
+
+    ; (Original)
+    ; z1 = (z2 + z3) * 0.541196100;
+    ; tmp2 = z1 + z3 * -1.847759065;
+    ; tmp3 = z1 + z2 * 0.765366865;
+    ;
+    ; (This implementation)
+    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
+    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
+
+    movdqa      xmm6, xmm1              ; xmm1=in2=z2
+    movdqa      xmm5, xmm1
+    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
+    punpckhwd   xmm5, xmm2
+    movdqa      xmm1, xmm6
+    movdqa      xmm2, xmm5
+    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=tmp3L
+    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
+    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
+    pmaddwd     xmm2, [rel PW_F054_MF130]  ; xmm2=tmp2H
+
+    movdqa      xmm3, xmm7
+    paddw       xmm7, xmm0              ; xmm7=in0+in4
+    psubw       xmm3, xmm0              ; xmm3=in0-in4
+
+    pxor        xmm4, xmm4
+    pxor        xmm0, xmm0
+    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
+    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm6              ; xmm4=tmp10L
+    psubd       xmm7, xmm6              ; xmm7=tmp13L
+    movdqa      xmm6, xmm0
+    paddd       xmm0, xmm5              ; xmm0=tmp10H
+    psubd       xmm6, xmm5              ; xmm6=tmp13H
+
+    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
+    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
+    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
+    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
+
+    pxor        xmm5, xmm5
+    pxor        xmm4, xmm4
+    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
+    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
+    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
+    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
+
+    movdqa      xmm0, xmm5
+    paddd       xmm5, xmm1              ; xmm5=tmp11L
+    psubd       xmm0, xmm1              ; xmm0=tmp12L
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm2              ; xmm4=tmp11H
+    psubd       xmm7, xmm2              ; xmm7=tmp12H
+
+    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
+    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
+    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
+    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
+
+    ; -- Odd part
+
+    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
+    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
+    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
+    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
+
+    movdqa      xmm5, xmm6
+    movdqa      xmm4, xmm3
+    paddw       xmm5, xmm1              ; xmm5=z3
+    paddw       xmm4, xmm2              ; xmm4=z4
+
+    ; (Original)
+    ; z5 = (z3 + z4) * 1.175875602;
+    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+    ; z3 += z5;  z4 += z5;
+    ;
+    ; (This implementation)
+    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm5
+    punpcklwd   xmm0, xmm4
+    punpckhwd   xmm7, xmm4
+    movdqa      xmm5, xmm0
+    movdqa      xmm4, xmm7
+    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3L
+    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3H
+    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
+    pmaddwd     xmm4, [rel PW_F117_F078]   ; xmm4=z4H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
+
+    ; (Original)
+    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
+    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
+    ;
+    ; (This implementation)
+    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+    ; tmp0 += z3;  tmp1 += z4;
+    ; tmp2 += z3;  tmp3 += z4;
+
+    movdqa      xmm0, xmm1
+    movdqa      xmm7, xmm1
+    punpcklwd   xmm0, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm1, xmm0
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0L
+    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp0H
+    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp3L
+    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3H
+
+    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
+    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
+    paddd       xmm1, xmm5              ; xmm1=tmp3L
+    paddd       xmm3, xmm4              ; xmm3=tmp3H
+
+    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
+    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
+
+    movdqa      xmm0, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm0, xmm6
+    punpckhwd   xmm7, xmm6
+    movdqa      xmm2, xmm0
+    movdqa      xmm6, xmm7
+    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1L
+    pmaddwd     xmm7, [rel PW_MF050_MF256]  ; xmm7=tmp1H
+    pmaddwd     xmm2, [rel PW_MF256_F050]   ; xmm2=tmp2L
+    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
+
+    paddd       xmm0, xmm5              ; xmm0=tmp1L
+    paddd       xmm7, xmm4              ; xmm7=tmp1H
+    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
+    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
+
+    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
+    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
+    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
+
+    movdqa      xmm0, xmm5
+    movdqa      xmm7, xmm4
+    paddd       xmm5, xmm1              ; xmm5=data0L
+    paddd       xmm4, xmm3              ; xmm4=data0H
+    psubd       xmm0, xmm1              ; xmm0=data7L
+    psubd       xmm7, xmm3              ; xmm7=data7H
+
+    movdqa      xmm1, [rel PD_DESCALE_P2]  ; xmm1=[rel PD_DESCALE_P2]
+
+    paddd       xmm5, xmm1
+    paddd       xmm4, xmm1
+    psrad       xmm5, DESCALE_P2
+    psrad       xmm4, DESCALE_P2
+    paddd       xmm0, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm0, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
+    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
+
+    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
+    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
+
+    movdqa      xmm4, xmm3
+    movdqa      xmm7, xmm1
+    paddd       xmm3, xmm2              ; xmm3=data1L
+    paddd       xmm1, xmm6              ; xmm1=data1H
+    psubd       xmm4, xmm2              ; xmm4=data6L
+    psubd       xmm7, xmm6              ; xmm7=data6H
+
+    movdqa      xmm2, [rel PD_DESCALE_P2]  ; xmm2=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm2
+    paddd       xmm1, xmm2
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm4, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm7, DESCALE_P2
+
+    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
+    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
+
+    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
+    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
+    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
+    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
+
+    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    movdqa      xmm4, xmm6
+    movdqa      xmm0, xmm2
+    paddd       xmm6, xmm1              ; xmm6=data2L
+    paddd       xmm2, xmm7              ; xmm2=data2H
+    psubd       xmm4, xmm1              ; xmm4=data5L
+    psubd       xmm0, xmm7              ; xmm0=data5H
+
+    movdqa      xmm5, [rel PD_DESCALE_P2]  ; xmm5=[rel PD_DESCALE_P2]
+
+    paddd       xmm6, xmm5
+    paddd       xmm2, xmm5
+    psrad       xmm6, DESCALE_P2
+    psrad       xmm2, DESCALE_P2
+    paddd       xmm4, xmm5
+    paddd       xmm0, xmm5
+    psrad       xmm4, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
+    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
+
+    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
+    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
+    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
+    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
+
+    movdqa      xmm2, xmm3
+    movdqa      xmm0, xmm1
+    paddd       xmm3, xmm7              ; xmm3=data3L
+    paddd       xmm1, xmm5              ; xmm1=data3H
+    psubd       xmm2, xmm7              ; xmm2=data4L
+    psubd       xmm0, xmm5              ; xmm0=data4H
+
+    movdqa      xmm7, [rel PD_DESCALE_P2]  ; xmm7=[rel PD_DESCALE_P2]
+
+    paddd       xmm3, xmm7
+    paddd       xmm1, xmm7
+    psrad       xmm3, DESCALE_P2
+    psrad       xmm1, DESCALE_P2
+    paddd       xmm2, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm2, DESCALE_P2
+    psrad       xmm0, DESCALE_P2
+
+    movdqa      xmm5, [rel PB_CENTERJSAMP]  ; xmm5=[rel PB_CENTERJSAMP]
+
+    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
+    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
+
+    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
+    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
+
+    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
+    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
+
+    paddb       xmm7, xmm5
+    paddb       xmm1, xmm5
+    paddb       xmm6, xmm5
+    paddb       xmm3, xmm5
+
+    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
+    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
+    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
+    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
+    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
+    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
+
+    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
+    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
+    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
+    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
+    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
+    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
+
+    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
+    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
+    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
+    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
+    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
+    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
+    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jidctred-sse2.asm b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
new file mode 100644
index 0000000000..4ece9d891c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jidctred-sse2.asm
@@ -0,0 +1,574 @@
+;
+; jidctred.asm - reduced-size IDCT (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; This file contains inverse-DCT routines that produce reduced-size
+; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
+; The following code is based directly on the IJG's original jidctred.c;
+; see the jidctred.c for more details.
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+
+%define CONST_BITS    13
+%define PASS1_BITS    2
+
+%define DESCALE_P1_4  (CONST_BITS - PASS1_BITS + 1)
+%define DESCALE_P2_4  (CONST_BITS + PASS1_BITS + 3 + 1)
+%define DESCALE_P1_2  (CONST_BITS - PASS1_BITS + 2)
+%define DESCALE_P2_2  (CONST_BITS + PASS1_BITS + 3 + 2)
+
+%if CONST_BITS == 13
+F_0_211 equ  1730  ; FIX(0.211164243)
+F_0_509 equ  4176  ; FIX(0.509795579)
+F_0_601 equ  4926  ; FIX(0.601344887)
+F_0_720 equ  5906  ; FIX(0.720959822)
+F_0_765 equ  6270  ; FIX(0.765366865)
+F_0_850 equ  6967  ; FIX(0.850430095)
+F_0_899 equ  7373  ; FIX(0.899976223)
+F_1_061 equ  8697  ; FIX(1.061594337)
+F_1_272 equ 10426  ; FIX(1.272758580)
+F_1_451 equ 11893  ; FIX(1.451774981)
+F_1_847 equ 15137  ; FIX(1.847759065)
+F_2_172 equ 17799  ; FIX(2.172734803)
+F_2_562 equ 20995  ; FIX(2.562915447)
+F_3_624 equ 29692  ; FIX(3.624509785)
+%else
+; NASM cannot do compile-time arithmetic on floating-point constants.
+%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
+F_0_211 equ DESCALE( 226735879, 30 - CONST_BITS)  ; FIX(0.211164243)
+F_0_509 equ DESCALE( 547388834, 30 - CONST_BITS)  ; FIX(0.509795579)
+F_0_601 equ DESCALE( 645689155, 30 - CONST_BITS)  ; FIX(0.601344887)
+F_0_720 equ DESCALE( 774124714, 30 - CONST_BITS)  ; FIX(0.720959822)
+F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
+F_0_850 equ DESCALE( 913142361, 30 - CONST_BITS)  ; FIX(0.850430095)
+F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
+F_1_061 equ DESCALE(1139878239, 30 - CONST_BITS)  ; FIX(1.061594337)
+F_1_272 equ DESCALE(1366614119, 30 - CONST_BITS)  ; FIX(1.272758580)
+F_1_451 equ DESCALE(1558831516, 30 - CONST_BITS)  ; FIX(1.451774981)
+F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
+F_2_172 equ DESCALE(2332956230, 30 - CONST_BITS)  ; FIX(2.172734803)
+F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
+F_3_624 equ DESCALE(3891787747, 30 - CONST_BITS)  ; FIX(3.624509785)
+%endif
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_CONST
+
+    alignz      32
+    GLOBAL_DATA(jconst_idct_red_sse2)
+
+EXTN(jconst_idct_red_sse2):
+
+PW_F184_MF076   times 4  dw  F_1_847, -F_0_765
+PW_F256_F089    times 4  dw  F_2_562,  F_0_899
+PW_F106_MF217   times 4  dw  F_1_061, -F_2_172
+PW_MF060_MF050  times 4  dw -F_0_601, -F_0_509
+PW_F145_MF021   times 4  dw  F_1_451, -F_0_211
+PW_F362_MF127   times 4  dw  F_3_624, -F_1_272
+PW_F085_MF072   times 4  dw  F_0_850, -F_0_720
+PD_DESCALE_P1_4 times 4  dd  1 << (DESCALE_P1_4 - 1)
+PD_DESCALE_P2_4 times 4  dd  1 << (DESCALE_P2_4 - 1)
+PD_DESCALE_P1_2 times 4  dd  1 << (DESCALE_P1_2 - 1)
+PD_DESCALE_P2_2 times 4  dd  1 << (DESCALE_P2_2 - 1)
+PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
+
+    alignz      32
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 4x4 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_4x4_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+%define original_rbp  rbp + 0
+%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
+                                        ; xmmword wk[WK_NUM]
+%define WK_NUM        2
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_4x4_sse2)
+
+EXTN(jsimd_idct_4x4_sse2):
+    push        rbp
+    mov         rax, rsp                     ; rax = original rbp
+    sub         rsp, byte 4
+    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
+    mov         [rsp], rax
+    mov         rbp, rsp                     ; rbp = aligned rbp
+    lea         rsp, [wk(0)]
+    collect_args 4
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+%ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
+    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    jnz         short .columnDCT
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    por         xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    por         xmm0, xmm1
+    packsswb    xmm0, xmm0
+    packsswb    xmm0, xmm0
+    movd        eax, xmm0
+    test        rax, rax
+    jnz         short .columnDCT
+
+    ; -- AC terms all zero
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    psllw       xmm0, PASS1_BITS
+
+    movdqa      xmm3, xmm0        ; xmm0=in0=(00 01 02 03 04 05 06 07)
+    punpcklwd   xmm0, xmm0        ; xmm0=(00 00 01 01 02 02 03 03)
+    punpckhwd   xmm3, xmm3        ; xmm3=(04 04 05 05 06 06 07 07)
+
+    pshufd      xmm1, xmm0, 0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
+    pshufd      xmm0, xmm0, 0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
+    pshufd      xmm6, xmm3, 0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
+    pshufd      xmm3, xmm3, 0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
+
+    jmp         near .column_end
+%endif
+.columnDCT:
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    movdqa      xmm4, xmm0
+    movdqa      xmm5, xmm0
+    punpcklwd   xmm4, xmm1
+    punpckhwd   xmm5, xmm1
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    pmaddwd     xmm4, [rel PW_F256_F089]   ; xmm4=(tmp2L)
+    pmaddwd     xmm5, [rel PW_F256_F089]   ; xmm5=(tmp2H)
+    pmaddwd     xmm0, [rel PW_F106_MF217]  ; xmm0=(tmp0L)
+    pmaddwd     xmm1, [rel PW_F106_MF217]  ; xmm1=(tmp0H)
+
+    movdqa      xmm6, xmm2
+    movdqa      xmm7, xmm2
+    punpcklwd   xmm6, xmm3
+    punpckhwd   xmm7, xmm3
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2L)
+    pmaddwd     xmm7, [rel PW_MF060_MF050]  ; xmm7=(tmp2H)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0L)
+    pmaddwd     xmm3, [rel PW_F145_MF021]   ; xmm3=(tmp0H)
+
+    paddd       xmm6, xmm4              ; xmm6=tmp2L
+    paddd       xmm7, xmm5              ; xmm7=tmp2H
+    paddd       xmm2, xmm0              ; xmm2=tmp0L
+    paddd       xmm3, xmm1              ; xmm3=tmp0H
+
+    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
+    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
+
+    ; -- Even part
+
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    pxor        xmm1, xmm1
+    pxor        xmm2, xmm2
+    punpcklwd   xmm1, xmm4               ; xmm1=tmp0L
+    punpckhwd   xmm2, xmm4               ; xmm2=tmp0H
+    psrad       xmm1, (16-CONST_BITS-1)  ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
+    psrad       xmm2, (16-CONST_BITS-1)  ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
+
+    movdqa      xmm3, xmm5              ; xmm5=in2=z2
+    punpcklwd   xmm5, xmm0              ; xmm0=in6=z3
+    punpckhwd   xmm3, xmm0
+    pmaddwd     xmm5, [rel PW_F184_MF076]  ; xmm5=tmp2L
+    pmaddwd     xmm3, [rel PW_F184_MF076]  ; xmm3=tmp2H
+
+    movdqa      xmm4, xmm1
+    movdqa      xmm0, xmm2
+    paddd       xmm1, xmm5              ; xmm1=tmp10L
+    paddd       xmm2, xmm3              ; xmm2=tmp10H
+    psubd       xmm4, xmm5              ; xmm4=tmp12L
+    psubd       xmm0, xmm3              ; xmm0=tmp12H
+
+    ; -- Final output stage
+
+    movdqa      xmm5, xmm1
+    movdqa      xmm3, xmm2
+    paddd       xmm1, xmm6              ; xmm1=data0L
+    paddd       xmm2, xmm7              ; xmm2=data0H
+    psubd       xmm5, xmm6              ; xmm5=data3L
+    psubd       xmm3, xmm7              ; xmm3=data3H
+
+    movdqa      xmm6, [rel PD_DESCALE_P1_4]  ; xmm6=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm1, xmm6
+    paddd       xmm2, xmm6
+    psrad       xmm1, DESCALE_P1_4
+    psrad       xmm2, DESCALE_P1_4
+    paddd       xmm5, xmm6
+    paddd       xmm3, xmm6
+    psrad       xmm5, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm1, xmm2              ; xmm1=data0=(00 01 02 03 04 05 06 07)
+    packssdw    xmm5, xmm3              ; xmm5=data3=(30 31 32 33 34 35 36 37)
+
+    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
+    movdqa      xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
+
+    movdqa      xmm2, xmm4
+    movdqa      xmm3, xmm0
+    paddd       xmm4, xmm7              ; xmm4=data1L
+    paddd       xmm0, xmm6              ; xmm0=data1H
+    psubd       xmm2, xmm7              ; xmm2=data2L
+    psubd       xmm3, xmm6              ; xmm3=data2H
+
+    movdqa      xmm7, [rel PD_DESCALE_P1_4]  ; xmm7=[rel PD_DESCALE_P1_4]
+
+    paddd       xmm4, xmm7
+    paddd       xmm0, xmm7
+    psrad       xmm4, DESCALE_P1_4
+    psrad       xmm0, DESCALE_P1_4
+    paddd       xmm2, xmm7
+    paddd       xmm3, xmm7
+    psrad       xmm2, DESCALE_P1_4
+    psrad       xmm3, DESCALE_P1_4
+
+    packssdw    xmm4, xmm0        ; xmm4=data1=(10 11 12 13 14 15 16 17)
+    packssdw    xmm2, xmm3        ; xmm2=data2=(20 21 22 23 24 25 26 27)
+
+    movdqa      xmm6, xmm1        ; transpose coefficients(phase 1)
+    punpcklwd   xmm1, xmm4        ; xmm1=(00 10 01 11 02 12 03 13)
+    punpckhwd   xmm6, xmm4        ; xmm6=(04 14 05 15 06 16 07 17)
+    movdqa      xmm7, xmm2        ; transpose coefficients(phase 1)
+    punpcklwd   xmm2, xmm5        ; xmm2=(20 30 21 31 22 32 23 33)
+    punpckhwd   xmm7, xmm5        ; xmm7=(24 34 25 35 26 36 27 37)
+
+    movdqa      xmm0, xmm1        ; transpose coefficients(phase 2)
+    punpckldq   xmm1, xmm2        ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
+    punpckhdq   xmm0, xmm2        ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
+    movdqa      xmm3, xmm6        ; transpose coefficients(phase 2)
+    punpckldq   xmm6, xmm7        ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
+    punpckhdq   xmm3, xmm7        ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
+.column_end:
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rax, [original_rbp]
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; -- Even part
+
+    pxor        xmm4, xmm4
+    punpcklwd   xmm4, xmm1               ; xmm4=tmp0
+    psrad       xmm4, (16-CONST_BITS-1)  ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
+
+    ; -- Odd part
+
+    punpckhwd   xmm1, xmm0
+    punpckhwd   xmm6, xmm3
+    movdqa      xmm5, xmm1
+    movdqa      xmm2, xmm6
+    pmaddwd     xmm1, [rel PW_F256_F089]    ; xmm1=(tmp2)
+    pmaddwd     xmm6, [rel PW_MF060_MF050]  ; xmm6=(tmp2)
+    pmaddwd     xmm5, [rel PW_F106_MF217]   ; xmm5=(tmp0)
+    pmaddwd     xmm2, [rel PW_F145_MF021]   ; xmm2=(tmp0)
+
+    paddd       xmm6, xmm1              ; xmm6=tmp2
+    paddd       xmm2, xmm5              ; xmm2=tmp0
+
+    ; -- Even part
+
+    punpcklwd   xmm0, xmm3
+    pmaddwd     xmm0, [rel PW_F184_MF076]  ; xmm0=tmp2
+
+    movdqa      xmm7, xmm4
+    paddd       xmm4, xmm0              ; xmm4=tmp10
+    psubd       xmm7, xmm0              ; xmm7=tmp12
+
+    ; -- Final output stage
+
+    movdqa      xmm1, [rel PD_DESCALE_P2_4]  ; xmm1=[rel PD_DESCALE_P2_4]
+
+    movdqa      xmm5, xmm4
+    movdqa      xmm3, xmm7
+    paddd       xmm4, xmm6              ; xmm4=data0=(00 10 20 30)
+    paddd       xmm7, xmm2              ; xmm7=data1=(01 11 21 31)
+    psubd       xmm5, xmm6              ; xmm5=data3=(03 13 23 33)
+    psubd       xmm3, xmm2              ; xmm3=data2=(02 12 22 32)
+
+    paddd       xmm4, xmm1
+    paddd       xmm7, xmm1
+    psrad       xmm4, DESCALE_P2_4
+    psrad       xmm7, DESCALE_P2_4
+    paddd       xmm5, xmm1
+    paddd       xmm3, xmm1
+    psrad       xmm5, DESCALE_P2_4
+    psrad       xmm3, DESCALE_P2_4
+
+    packssdw    xmm4, xmm3              ; xmm4=(00 10 20 30 02 12 22 32)
+    packssdw    xmm7, xmm5              ; xmm7=(01 11 21 31 03 13 23 33)
+
+    movdqa      xmm0, xmm4              ; transpose coefficients(phase 1)
+    punpcklwd   xmm4, xmm7              ; xmm4=(00 01 10 11 20 21 30 31)
+    punpckhwd   xmm0, xmm7              ; xmm0=(02 03 12 13 22 23 32 33)
+
+    movdqa      xmm6, xmm4              ; transpose coefficients(phase 2)
+    punpckldq   xmm4, xmm0              ; xmm4=(00 01 02 03 10 11 12 13)
+    punpckhdq   xmm6, xmm0              ; xmm6=(20 21 22 23 30 31 32 33)
+
+    packsswb    xmm4, xmm6              ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
+    paddb       xmm4, [rel PB_CENTERJSAMP]
+
+    pshufd      xmm2, xmm4, 0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
+    pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
+    pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
+    mov         rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
+    movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
+
+    uncollect_args 4
+    mov         rsp, rbp                ; rsp <- aligned rbp
+    pop         rsp                     ; rsp <- original rbp
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Perform dequantization and inverse DCT on one block of coefficients,
+; producing a reduced-size 2x2 output block.
+;
+; GLOBAL(void)
+; jsimd_idct_2x2_sse2(void *dct_table, JCOEFPTR coef_block,
+;                     JSAMPARRAY output_buf, JDIMENSION output_col)
+;
+
+; r10 = void *dct_table
+; r11 = JCOEFPTR coef_block
+; r12 = JSAMPARRAY output_buf
+; r13d = JDIMENSION output_col
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_idct_2x2_sse2)
+
+EXTN(jsimd_idct_2x2_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 4
+    push        rbx
+
+    ; ---- Pass 1: process columns from input.
+
+    mov         rdx, r10                ; quantptr
+    mov         rsi, r11                ; inptr
+
+    ; | input:                  | result:        |
+    ; | 00 01 ** 03 ** 05 ** 07 |                |
+    ; | 10 11 ** 13 ** 15 ** 17 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
+    ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
+    ; | 50 51 ** 53 ** 55 ** 57 |                |
+    ; | ** ** ** ** ** ** ** ** |                |
+    ; | 70 71 ** 73 ** 75 ** 77 |                |
+
+    ; -- Odd part
+
+    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    movdqa      xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
+    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
+    ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
+
+    pcmpeqd     xmm7, xmm7
+    pslld       xmm7, WORD_BIT          ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
+
+    movdqa      xmm4, xmm0              ; xmm4=(10 11 ** 13 ** 15 ** 17)
+    movdqa      xmm5, xmm2              ; xmm5=(50 51 ** 53 ** 55 ** 57)
+    punpcklwd   xmm4, xmm1              ; xmm4=(10 30 11 31 ** ** 13 33)
+    punpcklwd   xmm5, xmm3              ; xmm5=(50 70 51 71 ** ** 53 73)
+    pmaddwd     xmm4, [rel PW_F362_MF127]
+    pmaddwd     xmm5, [rel PW_F085_MF072]
+
+    psrld       xmm0, WORD_BIT          ; xmm0=(11 -- 13 -- 15 -- 17 --)
+    pand        xmm1, xmm7              ; xmm1=(-- 31 -- 33 -- 35 -- 37)
+    psrld       xmm2, WORD_BIT          ; xmm2=(51 -- 53 -- 55 -- 57 --)
+    pand        xmm3, xmm7              ; xmm3=(-- 71 -- 73 -- 75 -- 77)
+    por         xmm0, xmm1              ; xmm0=(11 31 13 33 15 35 17 37)
+    por         xmm2, xmm3              ; xmm2=(51 71 53 73 55 75 57 77)
+    pmaddwd     xmm0, [rel PW_F362_MF127]
+    pmaddwd     xmm2, [rel PW_F085_MF072]
+
+    paddd       xmm4, xmm5              ; xmm4=tmp0[col0 col1 **** col3]
+    paddd       xmm0, xmm2              ; xmm0=tmp0[col1 col3 col5 col7]
+
+    ; -- Even part
+
+    movdqa      xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
+    pmullw      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
+
+    ; xmm6=(00 01 ** 03 ** 05 ** 07)
+
+    movdqa      xmm1, xmm6              ; xmm1=(00 01 ** 03 ** 05 ** 07)
+    pslld       xmm6, WORD_BIT          ; xmm6=(-- 00 -- ** -- ** -- **)
+    pand        xmm1, xmm7              ; xmm1=(-- 01 -- 03 -- 05 -- 07)
+    psrad       xmm6, (WORD_BIT-CONST_BITS-2)  ; xmm6=tmp10[col0 **** **** ****]
+    psrad       xmm1, (WORD_BIT-CONST_BITS-2)  ; xmm1=tmp10[col1 col3 col5 col7]
+
+    ; -- Final output stage
+
+    movdqa      xmm3, xmm6
+    movdqa      xmm5, xmm1
+    paddd       xmm6, xmm4      ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
+    paddd       xmm1, xmm0      ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
+    psubd       xmm3, xmm4      ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
+    psubd       xmm5, xmm0      ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
+
+    movdqa      xmm2, [rel PD_DESCALE_P1_2]  ; xmm2=[rel PD_DESCALE_P1_2]
+
+    punpckldq   xmm6, xmm3              ; xmm6=(A0 B0 ** **)
+
+    movdqa      xmm7, xmm1
+    punpcklqdq  xmm1, xmm5              ; xmm1=(A1 A3 B1 B3)
+    punpckhqdq  xmm7, xmm5              ; xmm7=(A5 A7 B5 B7)
+
+    paddd       xmm6, xmm2
+    psrad       xmm6, DESCALE_P1_2
+
+    paddd       xmm1, xmm2
+    paddd       xmm7, xmm2
+    psrad       xmm1, DESCALE_P1_2
+    psrad       xmm7, DESCALE_P1_2
+
+    ; -- Prefetch the next coefficient block
+
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
+    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
+
+    ; ---- Pass 2: process rows, store into output array.
+
+    mov         rdi, r12                ; (JSAMPROW *)
+    mov         eax, r13d
+
+    ; | input:| result:|
+    ; | A0 B0 |        |
+    ; | A1 B1 | C0 C1  |
+    ; | A3 B3 | D0 D1  |
+    ; | A5 B5 |        |
+    ; | A7 B7 |        |
+
+    ; -- Odd part
+
+    packssdw    xmm1, xmm1              ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
+    packssdw    xmm7, xmm7              ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
+    pmaddwd     xmm1, [rel PW_F362_MF127]
+    pmaddwd     xmm7, [rel PW_F085_MF072]
+
+    paddd       xmm1, xmm7              ; xmm1=tmp0[row0 row1 row0 row1]
+
+    ; -- Even part
+
+    pslld       xmm6, (CONST_BITS+2)    ; xmm6=tmp10[row0 row1 **** ****]
+
+    ; -- Final output stage
+
+    movdqa      xmm4, xmm6
+    paddd       xmm6, xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
+    psubd       xmm4, xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
+
+    punpckldq   xmm6, xmm4     ; xmm6=(C0 D0 C1 D1)
+
+    paddd       xmm6, [rel PD_DESCALE_P2_2]
+    psrad       xmm6, DESCALE_P2_2
+
+    packssdw    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
+    packsswb    xmm6, xmm6              ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
+    paddb       xmm6, [rel PB_CENTERJSAMP]
+
+    pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
+    pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
+
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
+    mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
+
+    pop         rbx
+    uncollect_args 4
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jquantf-sse2.asm b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
new file mode 100644
index 0000000000..ab2e3954f6
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquantf-sse2.asm
@@ -0,0 +1,155 @@
+;
+; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                           FAST_FLOAT *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
+
+EXTN(jsimd_convsamp_float_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx
+
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7
+    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
+
+    mov         rsi, r10
+    mov         eax, r11d
+    mov         rdi, r12
+    mov         rcx, DCTSIZE/2
+.convloop:
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
+
+    psubb       xmm0, xmm7              ; xmm0=(01234567)
+    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
+
+    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
+    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
+
+    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
+    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
+    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
+    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
+
+    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
+    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
+    cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
+    cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
+    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
+    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
+    cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
+    cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
+
+    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
+    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
+    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
+    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
+
+    add         rsi, byte 2*SIZEOF_JSAMPROW
+    add         rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; GLOBAL(void)
+; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+;                           FAST_FLOAT *workspace);
+;
+
+; r10 = JCOEFPTR coef_block
+; r11 = FAST_FLOAT *divisors
+; r12 = FAST_FLOAT *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
+
+EXTN(jsimd_quantize_float_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12
+    mov         rdx, r11
+    mov         rdi, r10
+    mov         rax, DCTSIZE2/16
+.quantloop:
+    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
+    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
+    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
+    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
+    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
+
+    cvtps2dq    xmm0, xmm0
+    cvtps2dq    xmm1, xmm1
+    cvtps2dq    xmm2, xmm2
+    cvtps2dq    xmm3, xmm3
+
+    packssdw    xmm0, xmm1
+    packssdw    xmm2, xmm3
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
+
+    add         rsi, byte 16*SIZEOF_FAST_FLOAT
+    add         rdx, byte 16*SIZEOF_FAST_FLOAT
+    add         rdi, byte 16*SIZEOF_JCOEF
+    dec         rax
+    jnz         short .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jquanti-avx2.asm b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
new file mode 100644
index 0000000000..70fe81139c
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-avx2.asm
@@ -0,0 +1,163 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit AVX2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, 2018, D. R. Commander.
+; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_avx2)
+
+EXTN(jsimd_convsamp_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         eax, r11d
+
+    mov         rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    mov         rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
+    pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
+
+    vpmovzxbw   ymm0, xmm0              ; ymm0=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
+    vpmovzxbw   ymm1, xmm1              ; ymm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
+    vpmovzxbw   ymm2, xmm2              ; ymm2=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
+    vpmovzxbw   ymm3, xmm3              ; ymm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
+
+    vpcmpeqw    ymm7, ymm7, ymm7
+    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    vpaddw      ymm0, ymm0, ymm7
+    vpaddw      ymm1, ymm1, ymm7
+    vpaddw      ymm2, ymm2, ymm7
+    vpaddw      ymm3, ymm3, ymm7
+
+    vmovdqu     YMMWORD [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     YMMWORD [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     YMMWORD [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     YMMWORD [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_avx2)
+
+EXTN(jsimd_quantize_avx2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    vmovdqu     ymm4, [YMMBLOCK(0,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm5, [YMMBLOCK(2,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm6, [YMMBLOCK(4,0,r12,SIZEOF_DCTELEM)]
+    vmovdqu     ymm7, [YMMBLOCK(6,0,r12,SIZEOF_DCTELEM)]
+    vpabsw      ymm0, ymm4
+    vpabsw      ymm1, ymm5
+    vpabsw      ymm2, ymm6
+    vpabsw      ymm3, ymm7
+
+    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,r11)]  ; correction + roundfactor
+    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,r11)]
+    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,r11)]
+    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,r11)]  ; reciprocal
+    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,r11)]
+    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,r11)]       ; scale
+    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,r11)]
+    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,r11)]
+    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,r11)]
+
+    vpsignw     ymm0, ymm0, ymm4
+    vpsignw     ymm1, ymm1, ymm5
+    vpsignw     ymm2, ymm2, ymm6
+    vpsignw     ymm3, ymm3, ymm7
+
+    vmovdqu     [YMMBLOCK(0,0,r10,SIZEOF_DCTELEM)], ymm0
+    vmovdqu     [YMMBLOCK(2,0,r10,SIZEOF_DCTELEM)], ymm1
+    vmovdqu     [YMMBLOCK(4,0,r10,SIZEOF_DCTELEM)], ymm2
+    vmovdqu     [YMMBLOCK(6,0,r10,SIZEOF_DCTELEM)], ymm3
+
+    vzeroupper
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jquanti-sse2.asm b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
new file mode 100644
index 0000000000..3ee442027a
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jquanti-sse2.asm
@@ -0,0 +1,188 @@
+;
+; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
+;
+; Based on the x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
+;                     DCTELEM *workspace);
+;
+
+; r10 = JSAMPARRAY sample_data
+; r11d = JDIMENSION start_col
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+    push        rbx
+
+    pxor        xmm6, xmm6              ; xmm6=(all 0's)
+    pcmpeqw     xmm7, xmm7
+    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+    mov         rsi, r10
+    mov         eax, r11d
+    mov         rdi, r12
+    mov         rcx, DCTSIZE/4
+.convloop:
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
+    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
+
+    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+
+    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
+    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
+
+    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
+    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
+    paddw       xmm0, xmm7
+    paddw       xmm1, xmm7
+    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
+    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
+    paddw       xmm2, xmm7
+    paddw       xmm3, xmm7
+
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 4*SIZEOF_JSAMPROW
+    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+    dec         rcx
+    jnz         short .convloop
+
+    pop         rbx
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+;   "How to optimize for the Pentium family of microprocessors"
+;   (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
+;                     DCTELEM *workspace);
+;
+
+%define RECIPROCAL(m, n, b) \
+  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
+%define CORRECTION(m, n, b) \
+  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
+%define SCALE(m, n, b) \
+  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
+
+; r10 = JCOEFPTR coef_block
+; r11 = DCTELEM *divisors
+; r12 = DCTELEM *workspace
+
+    align       32
+    GLOBAL_FUNCTION(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+    push        rbp
+    mov         rax, rsp
+    mov         rbp, rsp
+    collect_args 3
+
+    mov         rsi, r12
+    mov         rdx, r11
+    mov         rdi, r10
+    mov         rax, DCTSIZE2/32
+.quantloop:
+    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
+    movdqa      xmm0, xmm4
+    movdqa      xmm1, xmm5
+    movdqa      xmm2, xmm6
+    movdqa      xmm3, xmm7
+    psraw       xmm4, (WORD_BIT-1)
+    psraw       xmm5, (WORD_BIT-1)
+    psraw       xmm6, (WORD_BIT-1)
+    psraw       xmm7, (WORD_BIT-1)
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
+    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
+    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
+    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
+
+    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
+    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
+    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
+    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
+    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
+    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
+    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
+    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
+    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]
+
+    pxor        xmm0, xmm4
+    pxor        xmm1, xmm5
+    pxor        xmm2, xmm6
+    pxor        xmm3, xmm7
+    psubw       xmm0, xmm4
+    psubw       xmm1, xmm5
+    psubw       xmm2, xmm6
+    psubw       xmm3, xmm7
+    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
+    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
+    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
+    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
+
+    add         rsi, byte 32*SIZEOF_DCTELEM
+    add         rdx, byte 32*SIZEOF_DCTELEM
+    add         rdi, byte 32*SIZEOF_JCOEF
+    dec         rax
+    jnz         near .quantloop
+
+    uncollect_args 3
+    pop         rbp
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32
diff --git a/media/libjpeg/simd/x86_64/jsimd.c b/media/libjpeg/simd/x86_64/jsimd.c
new file mode 100644
index 0000000000..584a010ad3
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimd.c
@@ -0,0 +1,1068 @@
+/*
+ * jsimd_x86_64.c
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2009-2011, 2014, 2016, 2018, 2022, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library,
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ * For conditions of distribution and use, see copyright notice in jsimdext.inc
+ *
+ * This file contains the interface between the "normal" portions
+ * of the library and the SIMD implementations when running on a
+ * 64-bit x86 architecture.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "jconfigint.h"
+
+/*
+ * In the PIC cases, we have no guarantee that constants will keep
+ * their alignment. This macro allows us to verify it at runtime.
+ */
+#define IS_ALIGNED(ptr, order)  (((size_t)ptr & ((1 << order) - 1)) == 0)
+
+#define IS_ALIGNED_SSE(ptr)  (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
+#define IS_ALIGNED_AVX(ptr)  (IS_ALIGNED(ptr, 5)) /* 32 byte alignment */
+
+static unsigned int simd_support = (unsigned int)(~0);
+static unsigned int simd_huffman = 1;
+
+/*
+ * Check what SIMD accelerations are supported.
+ *
+ * FIXME: This code is racy under a multi-threaded environment.
+ */
+LOCAL(void)
+init_simd(void)
+{
+#ifndef NO_GETENV
+  char env[2] = { 0 };
+#endif
+
+  if (simd_support != ~0U)
+    return;
+
+  simd_support = jpeg_simd_cpu_support();
+
+#ifndef NO_GETENV
+  /* Force different settings through environment variables */
+  if (!GETENV_S(env, 2, "JSIMD_FORCESSE2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_SSE2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCEAVX2") && !strcmp(env, "1"))
+    simd_support &= JSIMD_AVX2;
+  if (!GETENV_S(env, 2, "JSIMD_FORCENONE") && !strcmp(env, "1"))
+    simd_support = 0;
+  if (!GETENV_S(env, 2, "JSIMD_NOHUFFENC") && !strcmp(env, "1"))
+    simd_huffman = 0;
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_rgb_ycc(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_ycc_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_rgb_gray(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_rgb_gray_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_rgb_gray_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_ycc_rgb_convert_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_ycc_rgb565(void)
+{
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                      JSAMPIMAGE output_buf, JDIMENSION output_row,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extrgb_ycc_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_ycc_convert_avx2;
+    sse2fct = jsimd_extrgbx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_ycc_convert_avx2;
+    sse2fct = jsimd_extbgrx_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_ycc_convert_avx2;
+    sse2fct = jsimd_extxbgr_ycc_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_ycc_convert_avx2;
+    sse2fct = jsimd_extxrgb_ycc_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_rgb_ycc_convert_avx2;
+    sse2fct = jsimd_rgb_ycc_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
+                       JSAMPIMAGE output_buf, JDIMENSION output_row,
+                       int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+  void (*sse2fct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_extrgb_gray_convert_avx2;
+    sse2fct = jsimd_extrgb_gray_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_extrgbx_gray_convert_avx2;
+    sse2fct = jsimd_extrgbx_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_extbgr_gray_convert_avx2;
+    sse2fct = jsimd_extbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_extbgrx_gray_convert_avx2;
+    sse2fct = jsimd_extbgrx_gray_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_extxbgr_gray_convert_avx2;
+    sse2fct = jsimd_extxbgr_gray_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_extxrgb_gray_convert_avx2;
+    sse2fct = jsimd_extxrgb_gray_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_rgb_gray_convert_avx2;
+    sse2fct = jsimd_rgb_gray_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+  else
+    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                      JDIMENSION input_row, JSAMPARRAY output_buf,
+                      int num_rows)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_ycc_extrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extrgb_convert_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_ycc_extrgbx_convert_avx2;
+    sse2fct = jsimd_ycc_extrgbx_convert_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_ycc_extbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extbgr_convert_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_ycc_extbgrx_convert_avx2;
+    sse2fct = jsimd_ycc_extbgrx_convert_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_ycc_extxbgr_convert_avx2;
+    sse2fct = jsimd_ycc_extxbgr_convert_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_ycc_extxrgb_convert_avx2;
+    sse2fct = jsimd_ycc_extxrgb_convert_sse2;
+    break;
+  default:
+    avx2fct = jsimd_ycc_rgb_convert_avx2;
+    sse2fct = jsimd_ycc_rgb_convert_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+  else
+    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
+}
+
+GLOBAL(void)
+jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                         JDIMENSION input_row, JSAMPARRAY output_buf,
+                         int num_rows)
+{
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_downsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(void)
+jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
+                      JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_downsample_avx2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+  else
+    jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
+                               compptr->v_samp_factor,
+                               compptr->width_in_blocks, input_data,
+                               output_data);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                    JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_upsample_avx2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+  else
+    jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, cinfo->output_width,
+                             input_data, output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_fancy_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v2_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_h2v1_fancy_upsample_avx2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+  else
+    jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
+                                   compptr->downsampled_width, input_data,
+                                   output_data_ptr);
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_merged_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) &&
+      IS_ALIGNED_AVX(jconst_merged_upsample_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) &&
+      IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v2_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v2_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v2_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v2_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v2_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v2_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    avx2fct = jsimd_h2v2_merged_upsample_avx2;
+    sse2fct = jsimd_h2v2_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(void)
+jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
+                           JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
+{
+  void (*avx2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*sse2fct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    avx2fct = jsimd_h2v1_extrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgb_merged_upsample_sse2;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    avx2fct = jsimd_h2v1_extrgbx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extrgbx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGR:
+    avx2fct = jsimd_h2v1_extbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    avx2fct = jsimd_h2v1_extbgrx_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extbgrx_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    avx2fct = jsimd_h2v1_extxbgr_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxbgr_merged_upsample_sse2;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    avx2fct = jsimd_h2v1_extxrgb_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_extxrgb_merged_upsample_sse2;
+    break;
+  default:
+    avx2fct = jsimd_h2v1_merged_upsample_avx2;
+    sse2fct = jsimd_h2v1_merged_upsample_sse2;
+    break;
+  }
+
+  if (simd_support & JSIMD_AVX2)
+    avx2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+  else
+    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
+}
+
+GLOBAL(int)
+jsimd_can_convsamp(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_convsamp_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
+               DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_convsamp_avx2(sample_data, start_col, workspace);
+  else
+    jsimd_convsamp_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(void)
+jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_fdct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow(DCTELEM *data)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_fdct_islow_avx2(data);
+  else
+    jsimd_fdct_islow_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast(DCTELEM *data)
+{
+  jsimd_fdct_ifast_sse2(data);
+}
+
+GLOBAL(void)
+jsimd_fdct_float(FAST_FLOAT *data)
+{
+  jsimd_fdct_float_sse(data);
+}
+
+GLOBAL(int)
+jsimd_can_quantize(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_AVX2)
+    return 1;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_quantize_avx2(coef_block, divisors, workspace);
+  else
+    jsimd_quantize_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(void)
+jsimd_quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
+                     FAST_FLOAT *workspace)
+{
+  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
+}
+
+GLOBAL(int)
+jsimd_can_idct_2x2(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_4x4(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_2x2(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_4x4(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+               JCOEFPTR coef_block, JSAMPARRAY output_buf,
+               JDIMENSION output_col)
+{
+  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
+}
+
+GLOBAL(int)
+jsimd_can_idct_islow(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(ISLOW_MULT_TYPE) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_AVX2) && IS_ALIGNED_AVX(jconst_idct_islow_avx2))
+    return 1;
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_ifast(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_idct_float(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(FAST_FLOAT) != 4)
+    return 0;
+  if (sizeof(FLOAT_MULT_TYPE) != 4)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  if (simd_support & JSIMD_AVX2)
+    jsimd_idct_islow_avx2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+  else
+    jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf,
+                          output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(void)
+jsimd_idct_float(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                 JCOEFPTR coef_block, JSAMPARRAY output_buf,
+                 JDIMENSION output_col)
+{
+  jsimd_idct_float_sse2(compptr->dct_table, coef_block, output_buf,
+                        output_col);
+}
+
+GLOBAL(int)
+jsimd_can_huff_encode_one_block(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if ((simd_support & JSIMD_SSE2) && simd_huffman &&
+      IS_ALIGNED_SSE(jconst_huff_encode_one_block))
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(JOCTET *)
+jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
+                            int last_dc_val, c_derived_tbl *dctbl,
+                            c_derived_tbl *actbl)
+{
+  return jsimd_huff_encode_one_block_sse2(state, buffer, block, last_dc_val,
+                                          dctbl, actbl);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_first_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(void)
+jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
+                                  const int *jpeg_natural_order_start, int Sl,
+                                  int Al, JCOEF *values, size_t *zerobits)
+{
+  jsimd_encode_mcu_AC_first_prepare_sse2(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
+}
+
+GLOBAL(int)
+jsimd_can_encode_mcu_AC_refine_prepare(void)
+{
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (simd_support & JSIMD_SSE2)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
+                                   const int *jpeg_natural_order_start, int Sl,
+                                   int Al, JCOEF *absvalues, size_t *bits)
+{
+  return jsimd_encode_mcu_AC_refine_prepare_sse2(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
+}
diff --git a/media/libjpeg/simd/x86_64/jsimdcpu.asm b/media/libjpeg/simd/x86_64/jsimdcpu.asm
new file mode 100644
index 0000000000..705f813d7d
--- /dev/null
+++ b/media/libjpeg/simd/x86_64/jsimdcpu.asm
@@ -0,0 +1,86 @@
+;
+; jsimdcpu.asm - SIMD instruction support check
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright (C) 2016, D. R. Commander.
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+
+%include "jsimdext.inc"
+
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
+;
+; Check if the CPU supports SIMD instructions
+;
+; GLOBAL(unsigned int)
+; jpeg_simd_cpu_support(void)
+;
+
+    align       32
+    GLOBAL_FUNCTION(jpeg_simd_cpu_support)
+
+EXTN(jpeg_simd_cpu_support):
+    push        rbx
+    push        rdi
+
+    xor         rdi, rdi                ; simd support flag
+
+    ; Assume that all x86-64 processors support SSE & SSE2 instructions
+    or          rdi, JSIMD_SSE2
+    or          rdi, JSIMD_SSE
+
+    ; Check whether CPUID leaf 07H is supported
+    ; (leaf 07H is used to check for AVX2 instruction support)
+    mov         rax, 0
+    cpuid
+    cmp         rax, 7
+    jl          short .return           ; Maximum leaf < 07H
+
+    ; Check for AVX2 instruction support
+    mov         rax, 7
+    xor         rcx, rcx
+    cpuid
+    mov         rax, rbx                ; rax = Extended feature flags
+
+    test        rax, 1<<5               ; bit5:AVX2
+    jz          short .return
+
+    ; Check for AVX2 O/S support
+    mov         rax, 1
+    xor         rcx, rcx
+    cpuid
+    test        rcx, 1<<27
+    jz          short .return           ; O/S does not support XSAVE
+    test        rcx, 1<<28
+    jz          short .return           ; CPU does not support AVX2
+
+    xor         rcx, rcx
+    xgetbv
+    and         rax, 6
+    cmp         rax, 6                  ; O/S does not manage XMM/YMM state
+                                        ; using XSAVE
+    jnz         short .return
+
+    or          rdi, JSIMD_AVX2
+
+.return:
+    mov         rax, rdi
+
+    pop         rdi
+    pop         rbx
+    ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+    align       32